From e3fb6475af8dc0c215835d89eb99f4159872284c Mon Sep 17 00:00:00 2001 From: Kevin De Porre Date: Thu, 11 Jun 2026 15:45:02 +0200 Subject: [PATCH] fix(agents-server-ui): show uncached input tokens in the meta row MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The input side of the token-usage label summed input + cacheRead + cacheWrite, so on warm-cache turns it re-counted the entire conversation on every step and read as a runaway cumulative number. Surface the uncached side instead (fresh input + cache writes, cache reads excluded) so the label reflects the new work each response did. Steps recorded before this change keep their stored totals — the fields are optional and the display sums what's persisted, so no migration is needed. Co-Authored-By: Claude Fable 5 --- .changeset/uncached-input-tokens.md | 19 ++++++++++++ packages/agents-runtime/src/entity-schema.ts | 3 ++ .../agents-runtime/src/entity-timeline.ts | 6 ++-- .../agents-runtime/src/outbound-bridge.ts | 3 ++ packages/agents-runtime/src/pi-adapter.ts | 31 ++++++++++--------- .../agents-runtime/test/pi-adapter.test.ts | 13 +++++--- .../src/components/TokenUsage.tsx | 5 +++ 7 files changed, 58 insertions(+), 22 deletions(-) create mode 100644 .changeset/uncached-input-tokens.md diff --git a/.changeset/uncached-input-tokens.md b/.changeset/uncached-input-tokens.md new file mode 100644 index 0000000000..ba783b0b5a --- /dev/null +++ b/.changeset/uncached-input-tokens.md @@ -0,0 +1,19 @@ +--- +'@electric-ax/agents-server-ui': patch +'@electric-ax/agents-runtime': patch +--- + +Show only uncached input tokens in the per-response token usage label. + +The input side previously summed `input + cacheRead + cacheWrite`, so +on warm-cache turns the meta row re-counted the entire conversation on +every step and ballooned into a cumulative number that said nothing +about the work the response actually did. The adapter now surfaces the +uncached side only — fresh prompt tokens plus cache writes, with +prompt-cache reads excluded. (`cacheWrite` is counted because +cache-enabled providers report newly appended prompt tokens there, +with `input` collapsing to ~0.) + +Steps recorded before this change keep their stored cache-inclusive +totals — both step fields are optional and the display just sums +what's persisted, so no migration is needed. diff --git a/packages/agents-runtime/src/entity-schema.ts b/packages/agents-runtime/src/entity-schema.ts index 0db9426126..60a9299d2b 100644 --- a/packages/agents-runtime/src/entity-schema.ts +++ b/packages/agents-runtime/src/entity-schema.ts @@ -160,6 +160,9 @@ type StepValue = { // end-of-message `usage` payload. Populated on `onStepEnd` when the // adapter has the data — older events without these fields stay // valid (both optional), so this is a strictly additive change. + // `input_tokens` is the *uncached* input side (fresh tokens plus + // cache writes; cache reads excluded) — the cache-inclusive total + // would re-count the whole conversation on every step. input_tokens?: number output_tokens?: number } diff --git a/packages/agents-runtime/src/entity-timeline.ts b/packages/agents-runtime/src/entity-timeline.ts index 116818b246..c76d34675b 100644 --- a/packages/agents-runtime/src/entity-timeline.ts +++ b/packages/agents-runtime/src/entity-timeline.ts @@ -62,8 +62,10 @@ export type EntityTimelineSection = done?: true error?: string // Summed across all steps of the run that produced this section. - // Either side may be missing if the provider didn't report it - // (e.g. older events recorded before tokens were persisted). + // `input` is the uncached side only (fresh tokens + cache writes) + // — see `StepValue.input_tokens`. Either side may be missing if + // the provider didn't report it (e.g. older events recorded + // before tokens were persisted). tokens?: { input?: number output?: number diff --git a/packages/agents-runtime/src/outbound-bridge.ts b/packages/agents-runtime/src/outbound-bridge.ts index 4ba16ace04..f603610fdd 100644 --- a/packages/agents-runtime/src/outbound-bridge.ts +++ b/packages/agents-runtime/src/outbound-bridge.ts @@ -104,6 +104,9 @@ export interface OutboundBridge { onStepStart: (opts?: { modelProvider?: string; modelId?: string }) => void onStepEnd: (opts?: { finishReason?: string + // Uncached input side only (fresh prompt tokens + cache writes; + // prompt-cache *reads* excluded) — the cache-inclusive total would + // re-count the whole conversation on every warm-cache step. tokenInput?: number tokenOutput?: number durationMs?: number diff --git a/packages/agents-runtime/src/pi-adapter.ts b/packages/agents-runtime/src/pi-adapter.ts index b1a4ccd65c..3fdc38ed18 100644 --- a/packages/agents-runtime/src/pi-adapter.ts +++ b/packages/agents-runtime/src/pi-adapter.ts @@ -379,19 +379,24 @@ export function createPiAgentAdapter( // `cacheRead` (prompt-cache hits — typically the // system prompt + prior history once the cache is // warm) and `cacheWrite` (tokens added to the cache - // this turn). What the user wants in the meta row is - // the total prompt volume the model actually saw, so - // we sum every side that arrived as a number. Reading - // only `usage.input` undercounts massively on second+ - // turns where most of the prompt hits the cache and - // `usage.input` collapses to a handful of tokens. + // this turn). The meta row shows the *uncached* input + // — `input + cacheWrite` — i.e. the new prompt work + // this step did. `cacheRead` is deliberately excluded: + // it re-counts the entire conversation on every warm + // turn, so including it balloons the label into a + // cumulative number that says nothing about this + // response. `cacheWrite` IS counted: cache-enabled + // providers report newly appended prompt tokens there + // (with `input` collapsing to ~0), so excluding it + // would surface tiny "3 input" labels instead. // // `inputTokens` / `outputTokens` are legacy flat // aliases (kept as a fallback for non-pi-ai providers - // that don't split the cache columns). We deliberately - // do NOT coerce a missing side to `0` — doing so - // would be indistinguishable from a real zero-token - // step in the meta row, and the query-layer + // that don't split the cache columns); with no cache + // split, the whole side counts as uncached. We + // deliberately do NOT coerce a missing side to `0` — + // doing so would be indistinguishable from a real + // zero-token step in the meta row, and the query-layer // `count(...)` aggregate would mark the side as // present when it really isn't. const sumPresentNumbers = ( @@ -408,11 +413,7 @@ export function createPiAgentAdapter( return saw ? total : undefined } const usageInput = - sumPresentNumbers([ - usage?.input, - usage?.cacheRead, - usage?.cacheWrite, - ]) ?? + sumPresentNumbers([usage?.input, usage?.cacheWrite]) ?? (typeof usage?.inputTokens === `number` ? usage.inputTokens : undefined) diff --git a/packages/agents-runtime/test/pi-adapter.test.ts b/packages/agents-runtime/test/pi-adapter.test.ts index 91748e899a..4bd4f2671a 100644 --- a/packages/agents-runtime/test/pi-adapter.test.ts +++ b/packages/agents-runtime/test/pi-adapter.test.ts @@ -934,11 +934,14 @@ describe(`toAgentHistory`, () => { expect(stepValue?.output_tokens).toBe(567) }) - it(`sums input + cacheRead + cacheWrite into the input token total`, async () => { + it(`sums input + cacheWrite (cache reads excluded) into the input tokens`, async () => { // Anthropic + other prompt-cache providers split input across - // three counters; reading only `usage.input` would surface - // tiny "3 input" labels on cache-warm turns. The adapter sums - // all three so the meta row reflects the real prompt volume. + // three counters. The adapter surfaces the *uncached* side — + // fresh tokens plus cache writes. `cacheRead` re-counts the + // entire history on every warm turn, so including it would make + // the meta row a runaway cumulative number; `cacheWrite` must be + // counted because cache-enabled providers report newly appended + // prompt tokens there (with `input` collapsing to ~0). const events = await runOnce( makeCompletedMessage({ input: 50, @@ -948,7 +951,7 @@ describe(`toAgentHistory`, () => { }) ) const stepValue = findStepUpdate(events) - expect(stepValue?.input_tokens).toBe(1350) + expect(stepValue?.input_tokens).toBe(150) expect(stepValue?.output_tokens).toBe(80) }) diff --git a/packages/agents-server-ui/src/components/TokenUsage.tsx b/packages/agents-server-ui/src/components/TokenUsage.tsx index 7e515693b6..6e1563d930 100644 --- a/packages/agents-server-ui/src/components/TokenUsage.tsx +++ b/packages/agents-server-ui/src/components/TokenUsage.tsx @@ -9,6 +9,11 @@ import styles from './TokenUsage.module.css' * jittering as numbers tick up (input grows when a tool result is * fed back; output grows when the model streams a new step). * + * `input` is the uncached input side only — fresh prompt tokens plus + * cache writes, with prompt-cache *reads* excluded. The cache-inclusive + * total re-counts the entire history on every step, so it balloons into + * a cumulative number that says nothing about the work this response did. + * * Either side may be `undefined` (the provider didn't emit it, or * the section is historical and was recorded before tokens were * persisted) — we skip the missing half rather than print `0`.