From e3fb6475af8dc0c215835d89eb99f4159872284c Mon Sep 17 00:00:00 2001
From: Kevin De Porre <kevin@electric-sql.com>
Date: Thu, 11 Jun 2026 15:45:02 +0200
Subject: [PATCH] fix(agents-server-ui): show uncached input tokens in the meta
 row
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The input side of the token-usage label summed input + cacheRead +
cacheWrite, so on warm-cache turns it re-counted the entire
conversation on every step and read as a runaway cumulative number.
Surface the uncached side instead (fresh input + cache writes, cache
reads excluded) so the label reflects the new work each response did.

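Steps recorded before this change keep their stored cache-inclusive
totals, so historical rows still show the larger numbers. The step
schema is unchanged (both token fields stay optional) and the display
sums whatever is persisted, so no migration is needed.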

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .changeset/uncached-input-tokens.md           | 19 ++++++++++++
 packages/agents-runtime/src/entity-schema.ts  |  3 ++
 .../agents-runtime/src/entity-timeline.ts     |  6 ++--
 .../agents-runtime/src/outbound-bridge.ts     |  3 ++
 packages/agents-runtime/src/pi-adapter.ts     | 31 ++++++++++---------
 .../agents-runtime/test/pi-adapter.test.ts    | 13 +++++---
 .../src/components/TokenUsage.tsx             |  5 +++
 7 files changed, 58 insertions(+), 22 deletions(-)
 create mode 100644 .changeset/uncached-input-tokens.md

diff --git a/.changeset/uncached-input-tokens.md b/.changeset/uncached-input-tokens.md
new file mode 100644
index 0000000000..ba783b0b5a
--- /dev/null
+++ b/.changeset/uncached-input-tokens.md
@@ -0,0 +1,19 @@
+---
+'@electric-ax/agents-server-ui': patch
+'@electric-ax/agents-runtime': patch
+---
+
+Show only uncached input tokens in the per-response token usage label.
+
+The input side previously summed `input + cacheRead + cacheWrite`, so
+on warm-cache turns the meta row re-counted the entire conversation on
+every step and ballooned into a cumulative number that said nothing
+about the work the response actually did. The adapter now surfaces the
+uncached side only — fresh prompt tokens plus cache writes, with
+prompt-cache reads excluded. (`cacheWrite` is counted because
+cache-enabled providers report newly appended prompt tokens there,
+with `input` collapsing to ~0.)
+
+Steps recorded before this change keep their stored cache-inclusive
+totals — `input_tokens` and `output_tokens` stay optional and the display
+just sums what's persisted, so no migration is needed.
diff --git a/packages/agents-runtime/src/entity-schema.ts b/packages/agents-runtime/src/entity-schema.ts
index 0db9426126..60a9299d2b 100644
--- a/packages/agents-runtime/src/entity-schema.ts
+++ b/packages/agents-runtime/src/entity-schema.ts
@@ -160,6 +160,9 @@ type StepValue = {
   // end-of-message `usage` payload. Populated on `onStepEnd` when the
   // adapter has the data — older events without these fields stay
   // valid (both optional), so this is a strictly additive change.
+  // `input_tokens` is the *uncached* input side (fresh tokens plus
+  // cache writes; cache reads excluded) — the cache-inclusive total
+  // would re-count the whole conversation on every step.
   input_tokens?: number
   output_tokens?: number
 }
diff --git a/packages/agents-runtime/src/entity-timeline.ts b/packages/agents-runtime/src/entity-timeline.ts
index 116818b246..c76d34675b 100644
--- a/packages/agents-runtime/src/entity-timeline.ts
+++ b/packages/agents-runtime/src/entity-timeline.ts
@@ -62,8 +62,10 @@ export type EntityTimelineSection =
       done?: true
       error?: string
       // Summed across all steps of the run that produced this section.
-      // Either side may be missing if the provider didn't report it
-      // (e.g. older events recorded before tokens were persisted).
+      // `input` is the uncached side only (fresh tokens + cache writes;
+      // see `StepValue.input_tokens`), though steps recorded before that
+      // change still carry cache-inclusive totals. Either side may be
+      // missing (provider omitted it, or events predate token persistence).
       tokens?: {
         input?: number
         output?: number
diff --git a/packages/agents-runtime/src/outbound-bridge.ts b/packages/agents-runtime/src/outbound-bridge.ts
index 4ba16ace04..f603610fdd 100644
--- a/packages/agents-runtime/src/outbound-bridge.ts
+++ b/packages/agents-runtime/src/outbound-bridge.ts
@@ -104,6 +104,9 @@ export interface OutboundBridge {
   onStepStart: (opts?: { modelProvider?: string; modelId?: string }) => void
   onStepEnd: (opts?: {
     finishReason?: string
+    // Uncached input side only (fresh prompt tokens + cache writes;
+    // prompt-cache *reads* excluded) — the cache-inclusive total would
+    // re-count the whole conversation on every warm-cache step.
     tokenInput?: number
     tokenOutput?: number
     durationMs?: number
diff --git a/packages/agents-runtime/src/pi-adapter.ts b/packages/agents-runtime/src/pi-adapter.ts
index b1a4ccd65c..3fdc38ed18 100644
--- a/packages/agents-runtime/src/pi-adapter.ts
+++ b/packages/agents-runtime/src/pi-adapter.ts
@@ -379,19 +379,24 @@ export function createPiAgentAdapter(
                 // `cacheRead` (prompt-cache hits — typically the
                 // system prompt + prior history once the cache is
                 // warm) and `cacheWrite` (tokens added to the cache
-                // this turn). What the user wants in the meta row is
-                // the total prompt volume the model actually saw, so
-                // we sum every side that arrived as a number. Reading
-                // only `usage.input` undercounts massively on second+
-                // turns where most of the prompt hits the cache and
-                // `usage.input` collapses to a handful of tokens.
+                // this turn). The meta row shows the *uncached* input
+                // — `input + cacheWrite` — i.e. the new prompt work
+                // this step did. `cacheRead` is deliberately excluded:
+                // it re-counts the entire conversation on every warm
+                // turn, so including it balloons the label into a
+                // cumulative number that says nothing about this
+                // response. `cacheWrite` IS counted: cache-enabled
+                // providers report newly appended prompt tokens there
+                // (with `input` collapsing to ~0), so excluding it
+                // would surface tiny "3 input" labels instead.
                 //
                 // `inputTokens` / `outputTokens` are legacy flat
                 // aliases (kept as a fallback for non-pi-ai providers
-                // that don't split the cache columns). We deliberately
-                // do NOT coerce a missing side to `0` — doing so
-                // would be indistinguishable from a real zero-token
-                // step in the meta row, and the query-layer
+                // that don't split the cache columns); with no cache
+                // split, the whole side counts as uncached. We
+                // deliberately do NOT coerce a missing side to `0` —
+                // doing so would be indistinguishable from a real
+                // zero-token step in the meta row, and the query-layer
                 // `count(...)` aggregate would mark the side as
                 // present when it really isn't.
                 const sumPresentNumbers = (
@@ -408,11 +413,7 @@ export function createPiAgentAdapter(
                   return saw ? total : undefined
                 }
                 const usageInput =
-                  sumPresentNumbers([
-                    usage?.input,
-                    usage?.cacheRead,
-                    usage?.cacheWrite,
-                  ]) ??
+                  sumPresentNumbers([usage?.input, usage?.cacheWrite]) ??
                   (typeof usage?.inputTokens === `number`
                     ? usage.inputTokens
                     : undefined)
diff --git a/packages/agents-runtime/test/pi-adapter.test.ts b/packages/agents-runtime/test/pi-adapter.test.ts
index 91748e899a..4bd4f2671a 100644
--- a/packages/agents-runtime/test/pi-adapter.test.ts
+++ b/packages/agents-runtime/test/pi-adapter.test.ts
@@ -934,11 +934,14 @@ describe(`toAgentHistory`, () => {
       expect(stepValue?.output_tokens).toBe(567)
     })
 
-    it(`sums input + cacheRead + cacheWrite into the input token total`, async () => {
+    it(`sums input + cacheWrite (cache reads excluded) into the input tokens`, async () => {
       // Anthropic + other prompt-cache providers split input across
-      // three counters; reading only `usage.input` would surface
-      // tiny "3 input" labels on cache-warm turns. The adapter sums
-      // all three so the meta row reflects the real prompt volume.
+      // three counters. The adapter surfaces the *uncached* side —
+      // fresh tokens plus cache writes. `cacheRead` re-counts the
+      // entire history on every warm turn, so including it would make
+      // the meta row a runaway cumulative number; `cacheWrite` must be
+      // counted because cache-enabled providers report newly appended
+      // prompt tokens there (with `input` collapsing to ~0).
       const events = await runOnce(
         makeCompletedMessage({
           input: 50,
@@ -948,7 +951,7 @@ describe(`toAgentHistory`, () => {
         })
       )
       const stepValue = findStepUpdate(events)
-      expect(stepValue?.input_tokens).toBe(1350)
+      expect(stepValue?.input_tokens).toBe(150)
       expect(stepValue?.output_tokens).toBe(80)
     })
 
diff --git a/packages/agents-server-ui/src/components/TokenUsage.tsx b/packages/agents-server-ui/src/components/TokenUsage.tsx
index 7e515693b6..6e1563d930 100644
--- a/packages/agents-server-ui/src/components/TokenUsage.tsx
+++ b/packages/agents-server-ui/src/components/TokenUsage.tsx
@@ -9,6 +9,11 @@ import styles from './TokenUsage.module.css'
  * jittering as numbers tick up (input grows when a tool result is
  * fed back; output grows when the model streams a new step).
  *
+ * `input` is the uncached input side only — fresh prompt tokens plus
+ * cache writes, with prompt-cache *reads* excluded. The cache-inclusive
+ * total re-counts the entire history on every step, so it balloons into
+ * a cumulative number that says nothing about the work this response did.
+ *
  * Either side may be `undefined` (the provider didn't emit it, or
  * the section is historical and was recorded before tokens were
  * persisted) — we skip the missing half rather than print `0`.