diff --git a/.changeset/reasoning-content.md b/.changeset/reasoning-content.md new file mode 100644 index 0000000000..4753fa1cbd --- /dev/null +++ b/.changeset/reasoning-content.md @@ -0,0 +1,49 @@ +--- +'@electric-ax/agents-server-ui': minor +'@electric-ax/agents-runtime': minor +'@electric-ax/agents': patch +'@electric-ax/agents-desktop': patch +--- + +Stream model reasoning / extended-thinking content into the UI. While +the model is "thinking" (Anthropic extended thinking, DeepSeek-R1 +reasoning, Moonshot K2, OpenAI Responses summaries) the agent response +now shows the live reasoning text faded above the answer, with the +existing `Thinking` shimmer heading and an elapsed-time ticker. Once +the reasoning settles it collapses to `▸ Thought for 12s` — click to +expand. Multiple reasoning rows per run are rendered independently in +order, so tool-using turns show each step's reasoning separately. + +Implementation: + +- **Schema** — `reasoning` row gains `run_id`, `encrypted` (Anthropic + redacted-thinking opaque payload, must round-trip back to the model + verbatim), and `summary_title` (extracted at write time for + providers that emit a bolded heading). New `reasoningDeltas` + collection mirrors `textDeltas` for streamed content. +- **Bridge** — `OutboundBridge` gains `onReasoningStart` / + `onReasoningDelta` / `onReasoningEnd`, parallel to the text path. +- **Adapter** — `pi-adapter.ts` routes pi-ai's `thinking_start` / + `thinking_delta` / `thinking_end` events to the bridge, parses the + `**Title**\n\n` heading (OpenAI Responses only) once at + `thinking_end` so the UI doesn't re-parse on every render. +- **Timeline** — `EntityTimelineRunRow` gains a live + `reasoning: Collection` with content + built from a delta-join, mirroring `EntityTimelineTextItem`. +- **UI** — New `` component renders above the + answer in `AgentResponseLive`. Live shows faded markdown via + `Streamdown` with `ThinkingIndicator` heading + summary title + + elapsed-time ticker. Settled collapses to `Thought for Ns` with + click-to-expand. Redacted Anthropic blocks render a single muted + line — content is opaque, but the encrypted payload is still + persisted server-side so the model gets it back next turn. + +Providers without reasoning emit nothing → no reasoning section +rendered. Historical responses recorded before this PR have no +closure cue, same as today. + +Anthropic extended thinking is now always-on for reasoning-capable +models: `reasoningEffort: auto` maps to the minimal budget +(1024 tokens), matching the OpenAI branch where `auto` already +defaulted to `minimal`. Explicit `low`/`medium`/`high` scale the +budget as before. diff --git a/packages/agents-runtime/src/entity-schema.ts b/packages/agents-runtime/src/entity-schema.ts index 0db9426126..6a2c6630c1 100644 --- a/packages/agents-runtime/src/entity-schema.ts +++ b/packages/agents-runtime/src/entity-schema.ts @@ -187,7 +187,24 @@ type ToolCallValue = { } type ReasoningValue = { key?: string + run_id?: string status: `streaming` | `completed` + // Anthropic emits "redacted thinking" content blocks the client can't + // display but MUST round-trip back to the model on the next turn or + // the conversation errors. Persist verbatim, render nothing. + encrypted?: string + // OpenAI's Responses API surfaces reasoning with a bolded title line + // (`**Inspecting PR workflow**\n\n`). We split it out at write + // time so the UI can drive a separate heading without re-parsing on + // every render. Empty / absent for providers that don't emit titles + // (Anthropic, DeepSeek-R1, Moonshot K2). + summary_title?: string +} +type ReasoningDeltaValue = { + key?: string + reasoning_id: string + run_id: string + delta: string } type ErrorEventValue = { key?: string @@ -530,7 +547,20 @@ function createReasoningSchema(): Schema { return z.object({ key: z.string().optional(), ...timelineOrderField, + run_id: z.string().optional(), status: z.enum([`streaming`, `completed`]), + encrypted: z.string().optional(), + summary_title: z.string().optional(), + }) +} + +function createReasoningDeltaSchema(): Schema { + return z.object({ + key: z.string().optional(), + ...timelineOrderField, + reasoning_id: z.string(), + run_id: z.string(), + delta: z.string(), }) } @@ -861,6 +891,7 @@ export type Text = SequencedPersistedRow export type TextDelta = SequencedPersistedRow export type ToolCall = SequencedPersistedRow export type Reasoning = SequencedPersistedRow +export type ReasoningDelta = SequencedPersistedRow export type ErrorEvent = SequencedPersistedRow export type MessageReceived = SequencedPersistedRow export type WakeEntry = SequencedPersistedRow @@ -951,6 +982,7 @@ export const ENTITY_COLLECTIONS = { textDeltas: `textDeltas`, toolCalls: `toolCalls`, reasoning: `reasoning`, + reasoningDeltas: `reasoningDeltas`, errors: `errors`, inbox: `inbox`, wakes: `wakes`, @@ -975,6 +1007,8 @@ export const BUILT_IN_EVENT_SCHEMAS = { tool_call: createToolCallSchema() as unknown as BuiltInEntitySchema, reasoning: createReasoningSchema() as unknown as BuiltInEntitySchema, + reasoning_delta: + createReasoningDeltaSchema() as unknown as BuiltInEntitySchema, error: createErrorEventSchema() as unknown as BuiltInEntitySchema, inbox: createMessageReceivedSchema() as unknown as BuiltInEntitySchema, @@ -1010,6 +1044,7 @@ type EntityCollectionsDefinition = { textDeltas: CollectionDefinition toolCalls: CollectionDefinition reasoning: CollectionDefinition + reasoningDeltas: CollectionDefinition errors: CollectionDefinition inbox: CollectionDefinition wakes: CollectionDefinition @@ -1062,6 +1097,12 @@ export const builtInCollections: EntityCollectionsDefinition = { type: `reasoning`, primaryKey: `key`, }, + reasoningDeltas: { + schema: + BUILT_IN_EVENT_SCHEMAS.reasoning_delta as StandardSchemaV1, + type: `reasoning_delta`, + primaryKey: `key`, + }, errors: { schema: BUILT_IN_EVENT_SCHEMAS.error as StandardSchemaV1, type: `error`, diff --git a/packages/agents-runtime/src/entity-timeline.ts b/packages/agents-runtime/src/entity-timeline.ts index 116818b246..6fed8bf8ca 100644 --- a/packages/agents-runtime/src/entity-timeline.ts +++ b/packages/agents-runtime/src/entity-timeline.ts @@ -242,6 +242,24 @@ export type EntityTimelineRunItem = toolCall: EntityTimelineToolCallItem } +export interface EntityTimelineReasoningItem { + key: string + run_id?: string + order: TimelineOrder + status: `streaming` | `completed` + // The concatenated `reasoning_delta` content lives under + // `body.content` rather than top-level — the wrapper is what + // forces TanStack DB to materialize the include before the row + // reaches `useLiveQuery`. See the timeline-query comment. + body?: { content: string } + // Optional bolded title parsed at write time — only OpenAI Responses + // emits these; null for Anthropic / DeepSeek / Moonshot. + summary_title?: string + // Anthropic redacted-thinking opaque payload. Persist verbatim so we + // can echo it back on the next turn; the UI shows a placeholder. + encrypted?: string +} + export interface EntityTimelineStepItem { key: string run_id?: string @@ -267,6 +285,7 @@ export interface EntityTimelineRunRow { status: `started` | `completed` | `failed` finish_reason?: string items: Collection + reasoning: Collection steps: Collection errors: Collection // Per-run token totals summed across all `steps` of the run. @@ -1385,6 +1404,13 @@ function buildEntityTimelineQuery( run_id: error.run_id, })) + // Union texts + tool calls into a single ordered stream. The + // text-delta join lives at this level (vs. inside the consumer's + // `items.select`) so the correlation key is `text.key` — a field + // on the raw text row — rather than a projected scalar. The only + // delta-join alias constraint is that it must NOT collide with + // the `chunk` alias used in the reasoning content sub-query + // below; that's why this one is `textChunk`. const runItemsSource = q .unionAll({ text: db.collections.texts, @@ -1402,11 +1428,13 @@ function buildEntityTimelineQuery( textContent: concat( toArray( q - .from({ chunk: db.collections.textDeltas }) - .where(({ chunk }) => eq(chunk.text_id, text.key)) - .orderBy(({ chunk }) => coalesce(chunk._timeline_order, `~`)) - .orderBy(({ chunk }) => chunk.key) - .select(({ chunk }) => chunk.delta) + .from({ textChunk: db.collections.textDeltas }) + .where(({ textChunk }) => eq(textChunk.text_id, text.key)) + .orderBy(({ textChunk }) => + coalesce(textChunk._timeline_order, `~`) + ) + .orderBy(({ textChunk }) => textChunk.key) + .select(({ textChunk }) => textChunk.delta) ) ), toolCall: caseWhen(toolCall.key, { @@ -1422,6 +1450,43 @@ function buildEntityTimelineQuery( }), })) + // Mirror `runItemsSource`'s shape for reasoning rows: the + // `concat(toArray(...))` include is *defined* on this top-level + // source, then the `reasoning:` consumer inside `runSource.select` + // below dereferences it into `content: r.reasoningContent`. The + // two-layer source/consumer split is load-bearing: `useLiveQuery` + // reads of a sub-collection that has an include co-defined in the + // same select return the row with `content: null` + a deferred + // `Symbol(includesRouting)` marker. Naming the include field in a + // downstream `.select` is what forces materialization — exactly + // how `items.text.content` pulls `item.textContent` out of + // `runItemsSource`. Alias is `reasoningChunk` to avoid colliding + // with `textChunk` used above. + const runReasoningSource = q + .from({ reasoning: db.collections.reasoning }) + .select(({ reasoning }) => ({ + key: reasoning.key, + run_id: reasoning.run_id, + order: coalesce(reasoning._timeline_order, `~`), + status: reasoning.status, + summary_title: reasoning.summary_title, + encrypted: reasoning.encrypted, + reasoningContent: concat( + toArray( + q + .from({ reasoningChunk: db.collections.reasoningDeltas }) + .where(({ reasoningChunk }) => + eq(reasoningChunk.reasoning_id, reasoning.key) + ) + .orderBy(({ reasoningChunk }) => + coalesce(reasoningChunk._timeline_order, `~`) + ) + .orderBy(({ reasoningChunk }) => reasoningChunk.key) + .select(({ reasoningChunk }) => reasoningChunk.delta) + ) + ), + })) + const runTokensSource = q .from({ step: db.collections.steps }) .groupBy(({ step }) => step.run_id) @@ -1484,6 +1549,28 @@ function buildEntityTimelineQuery( }), toolCall: item.toolCall, })), + reasoning: q + .from({ r: runReasoningSource }) + .where(({ r }) => eq(r.run_id, run.key)) + .orderBy(({ r }) => r.order) + .orderBy(({ r }) => r.key) + .select(({ r }) => ({ + key: r.key, + run_id: r.run_id, + order: r.order, + status: r.status, + // Wrap the include reference inside a `caseWhen` object body + // — the same construct items uses to materialize + // `item.textContent` into `text.content`. Bare top-level + // references leave the include deferred until UI reads it + // through `useLiveQuery`, which never gets through. UI reads + // `entry.body?.content` instead of `entry.content`. + body: caseWhen(r.key, { + content: r.reasoningContent, + }), + summary_title: r.summary_title, + encrypted: r.encrypted, + })), steps: q .from({ step: db.collections.steps }) .where(({ step }) => eq(step.run_id, run.key)) diff --git a/packages/agents-runtime/src/outbound-bridge.ts b/packages/agents-runtime/src/outbound-bridge.ts index 75eb1400f4..0fcc2cd2ed 100644 --- a/packages/agents-runtime/src/outbound-bridge.ts +++ b/packages/agents-runtime/src/outbound-bridge.ts @@ -7,6 +7,7 @@ interface IdCounters { step: number msg: number tc: number + reasoning: number deltaSeqs: Map } @@ -15,6 +16,7 @@ export interface OutboundIdSeed { step: number msg: number tc: number + reasoning: number cacheKey?: string } @@ -42,12 +44,13 @@ function scanCounters(events: Array): IdCounters { step: 0, msg: 0, tc: 0, + reasoning: 0, deltaSeqs: new Map(), } for (const ev of events) { if (!ev.key) continue - const match = ev.key.match(/^(run|step|msg|tc)-(\d+)/) + const match = ev.key.match(/^(run|step|msg|tc|reasoning)-(\d+)/) if (!match) continue const prefix = match[1] as keyof Omit const nextId = parseInt(match[2]!, 10) + 1 @@ -64,6 +67,7 @@ export async function loadOutboundIdSeed( const steps = db.collections.steps.toArray const texts = db.collections.texts.toArray const toolCalls = db.collections.toolCalls.toArray + const reasoning = db.collections.reasoning.toArray const runsCollectionId = db.collections.runs.id const dbSeed = { @@ -83,6 +87,10 @@ export async function loadOutboundIdSeed( toolCalls.map((toolCall) => toolCall.key), `tc` ), + reasoning: nextCounterFromKeys( + reasoning.map((r) => r.key), + `reasoning` + ), } const cachedSeed = outboundIdSeedCache.get(runsCollectionId) const seed: OutboundIdSeed = { @@ -90,6 +98,7 @@ export async function loadOutboundIdSeed( step: Math.max(dbSeed.step, cachedSeed?.step ?? 0), msg: Math.max(dbSeed.msg, cachedSeed?.msg ?? 0), tc: Math.max(dbSeed.tc, cachedSeed?.tc ?? 0), + reasoning: Math.max(dbSeed.reasoning, cachedSeed?.reasoning ?? 0), cacheKey: runsCollectionId, } outboundIdSeedCache.set(runsCollectionId, seed) @@ -110,6 +119,23 @@ export interface OutboundBridge { onTextStart: () => void onTextDelta: (delta: string) => void onTextEnd: () => void + // Reasoning / extended-thinking stream. Mirrors the text path: + // start opens a row, delta(s) append to a paired `reasoningDeltas` + // collection, end closes the row. + // + // `opts.encrypted` on end handles Anthropic's `redacted_thinking` + // content blocks — opaque payloads the client can't display but + // must round-trip back to the model verbatim on the next turn or + // the conversation errors. Persist as-is, render nothing. + // + // `opts.summaryTitle` (currently OpenAI Responses only — emitted + // as a bolded first line `**Inspecting PR workflow**\n\n`) + // is extracted at write time so the UI can drive a separate + // heading without re-parsing on every render. Skip for providers + // that don't emit titles (Anthropic, DeepSeek-R1, Moonshot K2). + onReasoningStart: () => void + onReasoningDelta: (delta: string) => void + onReasoningEnd: (opts?: { encrypted?: string; summaryTitle?: string }) => void onToolCallStart(toolCallId: string, name: string, args: unknown): void onToolCallStart(name: string, args: unknown): void onToolCallEnd( @@ -143,6 +169,7 @@ export function createOutboundBridge( step: counters.step, msg: counters.msg, tc: counters.tc, + reasoning: counters.reasoning, cacheKey, }) } @@ -152,6 +179,8 @@ export function createOutboundBridge( let currentStepNumber = 0 let currentMsgKey: string | null = null let currentTextRunKey: string | null = null + let currentReasoningKey: string | null = null + let currentReasoningRunKey: string | null = null const toolCallsById = new Map< string, { key: string; runKey: string; args: unknown } @@ -283,6 +312,56 @@ export function createOutboundBridge( ) }, + onReasoningStart() { + const runKey = requireActiveRun(`onReasoningStart`) + currentReasoningKey = `reasoning-${counters.reasoning++}` + persistSeed() + currentReasoningRunKey = runKey + counters.deltaSeqs.set(currentReasoningKey, 0) + writeEvent( + entityStateSchema.reasoning.insert({ + key: currentReasoningKey, + value: { status: `streaming`, run_id: runKey } as never, + }) as ChangeEvent + ) + }, + + onReasoningDelta(delta: string) { + if (!currentReasoningKey) return + const runKey = requireActiveRun(`onReasoningDelta`) + const seq = counters.deltaSeqs.get(currentReasoningKey) ?? 0 + counters.deltaSeqs.set(currentReasoningKey, seq + 1) + writeEvent( + entityStateSchema.reasoningDeltas.insert({ + key: `${currentReasoningKey}:${seq}`, + value: { + reasoning_id: currentReasoningKey, + run_id: runKey, + delta, + } as never, + }) as ChangeEvent + ) + }, + + onReasoningEnd(opts?: { encrypted?: string; summaryTitle?: string }) { + if (!currentReasoningKey) return + writeEvent( + entityStateSchema.reasoning.update({ + key: currentReasoningKey, + value: { + status: `completed`, + run_id: currentReasoningRunKey, + ...(opts?.encrypted !== undefined && { encrypted: opts.encrypted }), + ...(opts?.summaryTitle !== undefined && { + summary_title: opts.summaryTitle, + }), + } as never, + }) as ChangeEvent + ) + currentReasoningKey = null + currentReasoningRunKey = null + }, + onToolCallStart( toolCallIdOrName: string, nameOrArgs: string | unknown, diff --git a/packages/agents-runtime/src/pi-adapter.ts b/packages/agents-runtime/src/pi-adapter.ts index ab823f4304..890f7fb58a 100644 --- a/packages/agents-runtime/src/pi-adapter.ts +++ b/packages/agents-runtime/src/pi-adapter.ts @@ -28,6 +28,33 @@ import type { } from '@mariozechner/pi-ai' import type { LLMContentBlock, LLMMessage, LLMMessageContent } from './types' +/** + * Split a streamed reasoning blob into `{ title, body }`. + * + * OpenAI's Responses API surfaces reasoning summaries with a bolded + * first line — `**Inspecting PR workflow**\n\n` — which we want + * to drive a separate heading in the UI rather than render inline. + * Anthropic / DeepSeek-R1 / Moonshot K2 don't emit titles; for them + * the regex doesn't match and `title` stays `null`. + * + * Match is anchored to the start, requires a blank-line terminator + * (so partial titles mid-stream don't get prematurely promoted), and + * forbids `*` or newline inside the title (so we don't accidentally + * eat bolded emphasis later in the text). + */ +function parseReasoningSummary(text: string): { + title: string | null + body: string +} { + const content = text.trim() + const match = content.match(/^\*\*([^*\n]+)\*\*(?:\r?\n\r?\n|$)/) + if (!match) return { title: null, body: content } + return { + title: match[1]!.trim(), + body: content.slice(match[0].length).trimEnd(), + } +} + // ============================================================================ // Options // ============================================================================ @@ -221,6 +248,8 @@ export function createPiAgentAdapter( let disposed = false let stepStartTime = 0 let textStarted = false + let reasoningStarted = false + let reasoningAccum = `` let abortedRun = false const model = resolvePiModel({ @@ -274,6 +303,8 @@ export function createPiAgentAdapter( case `message_start`: { stepStartTime = Date.now() textStarted = false + reasoningStarted = false + reasoningAccum = `` bridge.onStepStart({ modelProvider: model.provider, modelId: model.id, @@ -293,6 +324,42 @@ export function createPiAgentAdapter( } bridge.onTextDelta(assistantEvent.delta ?? ``) textDeltaCount++ + } else if (assistantEvent?.type === `thinking_start`) { + // Open a reasoning row even if no delta arrives — some + // providers emit an empty thinking block (e.g. when + // reasoning is gated to a level the model didn't use). + // We close it on `thinking_end` regardless. + if (!reasoningStarted) { + reasoningStarted = true + reasoningAccum = `` + bridge.onReasoningStart() + } + } else if (assistantEvent?.type === `thinking_delta`) { + // Defensive: providers occasionally emit the first + // delta without a matching `thinking_start`. Open the + // row lazily so we don't drop the chunk. + if (!reasoningStarted) { + reasoningStarted = true + reasoningAccum = `` + bridge.onReasoningStart() + } + const delta = assistantEvent.delta ?? `` + reasoningAccum += delta + bridge.onReasoningDelta(delta) + } else if (assistantEvent?.type === `thinking_end`) { + if (reasoningStarted) { + // Parse a bolded `**Title**\n\n` prefix once, here, + // so the UI can drive a heading without re-parsing on + // every render. Only OpenAI's Responses API emits + // these today (Anthropic / DeepSeek don't); the + // helper returns no title for un-titled streams. + const { title } = parseReasoningSummary(reasoningAccum) + bridge.onReasoningEnd( + title !== null ? { summaryTitle: title } : undefined + ) + reasoningStarted = false + reasoningAccum = `` + } } else { runtimeLog.debug( logPrefix, @@ -339,6 +406,19 @@ export function createPiAgentAdapter( bridge.onTextEnd() textStarted = false } + if (reasoningStarted) { + // Provider closed the message without an explicit + // `thinking_end` (rare, but seen on aborts / errors). + // Close the open reasoning row with whatever title we + // can salvage from the accumulator so it doesn't sit + // forever in `streaming` state. + const { title } = parseReasoningSummary(reasoningAccum) + bridge.onReasoningEnd( + title !== null ? { summaryTitle: title } : undefined + ) + reasoningStarted = false + reasoningAccum = `` + } const usage = msg?.usage const hasToolCalls = msg?.content?.some( diff --git a/packages/agents-runtime/test/entity-timeline.test.ts b/packages/agents-runtime/test/entity-timeline.test.ts index 6528346bbc..34cce5aa78 100644 --- a/packages/agents-runtime/test/entity-timeline.test.ts +++ b/packages/agents-runtime/test/entity-timeline.test.ts @@ -1606,6 +1606,11 @@ describe(`entity includes query`, () => { ) const manifests = createSyncCollection(`test-manifests`, takeOffset) const childStatus = createSyncCollection(`test-child-status`, takeOffset) + const reasoning = createSyncCollection(`test-reasoning`, takeOffset) + const reasoningDeltas = createSyncCollection( + `test-reasoningDeltas`, + takeOffset + ) return { collections: { runs: runs.collection, @@ -1621,6 +1626,8 @@ describe(`entity includes query`, () => { contextRemoved: contextRemoved.collection, manifests: manifests.collection, childStatus: childStatus.collection, + reasoning: reasoning.collection, + reasoningDeltas: reasoningDeltas.collection, }, sync: { runs: withSeqInjection(runs, takeSeq), @@ -1636,6 +1643,8 @@ describe(`entity includes query`, () => { contextRemoved: withSeqInjection(contextRemoved, takeSeq), manifests: withSeqInjection(manifests, takeSeq), childStatus: withSeqInjection(childStatus, takeSeq), + reasoning: withSeqInjection(reasoning, takeSeq), + reasoningDeltas: withSeqInjection(reasoningDeltas, takeSeq), }, } } @@ -2293,6 +2302,343 @@ describe(`entity includes query`, () => { expect(liveEntity?.status).toBeUndefined() }) + function createTimelineCollections() { + let nextOffset = 1 + let nextSeq = 1 + const takeOffset = () => offset(nextOffset++) + const takeSeq = () => nextSeq++ + const runs = createSyncCollection(`tl-runs`, takeOffset) + const texts = createSyncCollection(`tl-texts`, takeOffset) + const textDeltas = createSyncCollection(`tl-textDeltas`, takeOffset) + const toolCalls = createSyncCollection(`tl-toolCalls`, takeOffset) + const steps = createSyncCollection(`tl-steps`, takeOffset) + const errors = createSyncCollection(`tl-errors`, takeOffset) + const inbox = createSyncCollection(`tl-inbox`, takeOffset) + const wakes = createSyncCollection(`tl-wakes`, takeOffset) + const signals = createSyncCollection(`tl-signals`, takeOffset) + const contextInserted = createSyncCollection( + `tl-context-inserted`, + takeOffset + ) + const contextRemoved = createSyncCollection( + `tl-context-removed`, + takeOffset + ) + const manifests = createSyncCollection(`tl-manifests`, takeOffset) + const childStatus = createSyncCollection(`tl-child-status`, takeOffset) + const reasoning = createSyncCollection(`tl-reasoning`, takeOffset) + const reasoningDeltas = createSyncCollection( + `tl-reasoningDeltas`, + takeOffset + ) + return { + collections: { + runs: runs.collection, + texts: texts.collection, + textDeltas: textDeltas.collection, + toolCalls: toolCalls.collection, + steps: steps.collection, + errors: errors.collection, + inbox: inbox.collection, + wakes: wakes.collection, + signals: signals.collection, + contextInserted: contextInserted.collection, + contextRemoved: contextRemoved.collection, + manifests: manifests.collection, + childStatus: childStatus.collection, + reasoning: reasoning.collection, + reasoningDeltas: reasoningDeltas.collection, + }, + sync: { + runs: withSeqInjection(runs, takeSeq), + texts: withSeqInjection(texts, takeSeq), + textDeltas: withSeqInjection(textDeltas, takeSeq), + toolCalls: withSeqInjection(toolCalls, takeSeq), + steps: withSeqInjection(steps, takeSeq), + errors: withSeqInjection(errors, takeSeq), + inbox: withSeqInjection(inbox, takeSeq), + wakes: withSeqInjection(wakes, takeSeq), + signals: withSeqInjection(signals, takeSeq), + contextInserted: withSeqInjection(contextInserted, takeSeq), + contextRemoved: withSeqInjection(contextRemoved, takeSeq), + manifests: withSeqInjection(manifests, takeSeq), + childStatus: withSeqInjection(childStatus, takeSeq), + reasoning: withSeqInjection(reasoning, takeSeq), + reasoningDeltas: withSeqInjection(reasoningDeltas, takeSeq), + }, + } + } + + function getRows(liveQuery: any): Array { + return Array.from(liveQuery.entries()).map(([, v]: any) => v) + } + + it(`live items.text.content streams in even alongside reasoning (alias-collision regression)`, async () => { + // Regression: the text-content correlated sub-query inside + // `items.select(...)` and the reasoning-content sub-query both + // used `chunk` as their `from({...})` alias. The collision broke + // the items text-content join silently — `content` came back as + // an empty string even though the deltas were in the local DB. + // The fix is to use distinct aliases (`textChunk` vs `chunk`). + const { collections, sync } = createTimelineCollections() + const liveQuery = createLiveQueryCollection({ + query: createEntityTimelineQuery({ collections } as any), + startSync: true, + }) + await liveQuery.preload() + + sync.runs.insert({ key: `run-0`, status: `started` }) + sync.texts.insert({ + key: `msg-0`, + run_id: `run-0`, + status: `streaming`, + }) + sync.textDeltas.insert({ + key: `msg-0:0`, + text_id: `msg-0`, + run_id: `run-0`, + delta: `Hello`, + }) + sync.textDeltas.insert({ + key: `msg-0:1`, + text_id: `msg-0`, + run_id: `run-0`, + delta: ` world`, + }) + // Insert a reasoning row alongside the text row so the items + // text-content sub-query and the reasoning sub-query are both + // active in the same live projection — that's the configuration + // that surfaced the collision. + sync.reasoning.insert({ + key: `reasoning-0`, + run_id: `run-0`, + status: `streaming`, + }) + sync.texts.update({ + key: `msg-0`, + run_id: `run-0`, + status: `completed`, + }) + sync.runs.update({ + key: `run-0`, + status: `completed`, + finish_reason: `stop`, + }) + await new Promise((r) => setTimeout(r, 50)) + + const rows = getRows(liveQuery) + const runRow = rows.find((r) => r.run?.key === `run-0`) + expect(runRow).toBeTruthy() + const items = Array.from(runRow.run.items.toArray) as Array + expect(items).toHaveLength(1) + const item = items[0] + expect(item.text?.key).toBe(`msg-0`) + expect(item.text?.content).toBe(`Hello world`) + }) + + it(`reasoning content survives multiple run-row updates in sequence`, async () => { + // Even closer to production: the run row gets updated MULTIPLE + // times (each delta + status flip), which may invalidate the + // child sub-collection between evaluations. + const { collections, sync } = createTimelineCollections() + const liveQuery = createLiveQueryCollection({ + query: createEntityTimelineQuery({ collections } as any), + startSync: true, + }) + await liveQuery.preload() + + sync.runs.insert({ key: `run-0`, status: `started` }) + sync.reasoning.insert({ + key: `reasoning-0`, + run_id: `run-0`, + status: `streaming`, + }) + sync.reasoningDeltas.insert({ + key: `reasoning-0:0`, + reasoning_id: `reasoning-0`, + run_id: `run-0`, + delta: `A`, + }) + sync.reasoningDeltas.insert({ + key: `reasoning-0:1`, + reasoning_id: `reasoning-0`, + run_id: `run-0`, + delta: `B`, + }) + sync.reasoning.update({ + key: `reasoning-0`, + run_id: `run-0`, + status: `completed`, + }) + // Then several text rows / deltas (each triggers run updates + // through derived projections). + sync.texts.insert({ + key: `msg-0`, + run_id: `run-0`, + status: `streaming`, + }) + for (let i = 0; i < 5; i++) { + sync.textDeltas.insert({ + key: `msg-0:${i}`, + text_id: `msg-0`, + run_id: `run-0`, + delta: `t${i}`, + }) + } + sync.texts.update({ + key: `msg-0`, + run_id: `run-0`, + status: `completed`, + }) + // Finally the run row update — the moment the bug surfaces. + sync.runs.update({ + key: `run-0`, + status: `completed`, + finish_reason: `stop`, + }) + await new Promise((r) => setTimeout(r, 100)) + + const rows = getRows(liveQuery) + const runRow = rows.find((r) => r.run?.key === `run-0`) + expect(runRow).toBeTruthy() + const reasoning = Array.from(runRow.run.reasoning.toArray) as Array + expect(reasoning).toHaveLength(1) + expect(reasoning[0].body?.content).toBe(`AB`) + }) + + it(`reasoning content populates even when text deltas are also present`, async () => { + // Production scenario: a run has BOTH text deltas and reasoning + // deltas. The reasoning sub-query was returning `content: null` + // in the running app even though the deltas were in the local DB. + const { collections, sync } = createTimelineCollections() + const liveQuery = createLiveQueryCollection({ + query: createEntityTimelineQuery({ collections } as any), + startSync: true, + }) + await liveQuery.preload() + + sync.runs.insert({ key: `run-0`, status: `started` }) + sync.reasoning.insert({ + key: `reasoning-0`, + run_id: `run-0`, + status: `streaming`, + }) + sync.reasoningDeltas.insert({ + key: `reasoning-0:0`, + reasoning_id: `reasoning-0`, + run_id: `run-0`, + delta: `Thinking part 1. `, + }) + sync.reasoningDeltas.insert({ + key: `reasoning-0:1`, + reasoning_id: `reasoning-0`, + run_id: `run-0`, + delta: `Thinking part 2.`, + }) + sync.reasoning.update({ + key: `reasoning-0`, + run_id: `run-0`, + status: `completed`, + }) + sync.texts.insert({ + key: `msg-0`, + run_id: `run-0`, + status: `streaming`, + }) + sync.textDeltas.insert({ + key: `msg-0:0`, + text_id: `msg-0`, + run_id: `run-0`, + delta: `Answer part 1. `, + }) + sync.textDeltas.insert({ + key: `msg-0:1`, + text_id: `msg-0`, + run_id: `run-0`, + delta: `Answer part 2.`, + }) + sync.texts.update({ + key: `msg-0`, + run_id: `run-0`, + status: `completed`, + }) + sync.runs.update({ + key: `run-0`, + status: `completed`, + finish_reason: `stop`, + }) + await new Promise((r) => setTimeout(r, 50)) + + const rows = getRows(liveQuery) + const runRow = rows.find((r) => r.run?.key === `run-0`) + expect(runRow).toBeTruthy() + const reasoning = Array.from(runRow.run.reasoning.toArray) as Array + expect(reasoning).toHaveLength(1) + expect(reasoning[0].body?.content).toBe( + `Thinking part 1. Thinking part 2.` + ) + const items = Array.from(runRow.run.items.toArray) as Array + expect(items).toHaveLength(1) + expect(items[0].text?.content).toBe(`Answer part 1. Answer part 2.`) + }) + + it(`reasoning content remains populated after status flips to completed`, async () => { + // Reproduces the bug where the reasoning row's `content` field + // came back as `undefined` (not even `""`) once the row's status + // transitioned to `completed`, even though the deltas were still + // present in the local DB. This made the "Thought for Ns" + // expanded view render an empty body. + const { collections, sync } = createTimelineCollections() + const liveQuery = createLiveQueryCollection({ + query: createEntityTimelineQuery({ collections } as any), + startSync: true, + }) + await liveQuery.preload() + + sync.runs.insert({ key: `run-0`, status: `started` }) + sync.reasoning.insert({ + key: `reasoning-0`, + run_id: `run-0`, + status: `streaming`, + }) + sync.reasoningDeltas.insert({ + key: `reasoning-0:0`, + reasoning_id: `reasoning-0`, + run_id: `run-0`, + delta: `First thinking step. `, + }) + sync.reasoningDeltas.insert({ + key: `reasoning-0:1`, + reasoning_id: `reasoning-0`, + run_id: `run-0`, + delta: `Second thinking step.`, + }) + // Now flip the row to completed — this is the transition that + // caused content to vanish in the running app. + sync.reasoning.update({ + key: `reasoning-0`, + run_id: `run-0`, + status: `completed`, + }) + sync.runs.update({ + key: `run-0`, + status: `completed`, + finish_reason: `stop`, + }) + await new Promise((r) => setTimeout(r, 50)) + + const rows = getRows(liveQuery) + const runRow = rows.find((r) => r.run?.key === `run-0`) + expect(runRow).toBeTruthy() + const reasoning = Array.from(runRow.run.reasoning.toArray) as Array + expect(reasoning).toHaveLength(1) + expect(reasoning[0].key).toBe(`reasoning-0`) + expect(reasoning[0].status).toBe(`completed`) + expect(reasoning[0].body?.content).toBe( + `First thinking step. Second thinking step.` + ) + }) + it(`aggregates per-run token totals from steps`, async () => { const { collections, sync } = createEntityCollections() const queryFn = createEntityIncludesQuery({ collections } as any) diff --git a/packages/agents-runtime/test/outbound-bridge.test.ts b/packages/agents-runtime/test/outbound-bridge.test.ts index 96c5f28e43..62fb5d75b3 100644 --- a/packages/agents-runtime/test/outbound-bridge.test.ts +++ b/packages/agents-runtime/test/outbound-bridge.test.ts @@ -202,7 +202,7 @@ describe(`createOutboundBridge`, () => { it(`uses a preloaded ID seed for later reruns`, () => { const writes: Array = [] const bridge = createOutboundBridge( - { run: 2, step: 4, msg: 3, tc: 5 }, + { run: 2, step: 4, msg: 3, tc: 5, reasoning: 0 }, (event) => { writes.push(event) } diff --git a/packages/agents-runtime/test/pi-adapter.test.ts b/packages/agents-runtime/test/pi-adapter.test.ts index 14ae6f7784..28846d8ace 100644 --- a/packages/agents-runtime/test/pi-adapter.test.ts +++ b/packages/agents-runtime/test/pi-adapter.test.ts @@ -44,7 +44,7 @@ describe(`createPiAgentAdapter`, () => { entityUrl: `test/entity-1`, epoch: 1, messages: [], - outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0 }, + outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0, reasoning: 0 }, writeEvent: (_event: ChangeEvent) => {}, } @@ -113,7 +113,7 @@ describe(`createPiAgentAdapter`, () => { entityUrl: `test/entity-1`, epoch: 1, messages: [], - outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0 }, + outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0, reasoning: 0 }, writeEvent: (_event: ChangeEvent) => {}, }) const controller = new AbortController() @@ -141,7 +141,7 @@ describe(`createPiAgentAdapter`, () => { entityUrl: `test/entity-1`, epoch: 1, messages: [], - outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0 }, + outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0, reasoning: 0 }, writeEvent: (_event: ChangeEvent) => {}, }) const controller = new AbortController() @@ -205,7 +205,7 @@ describe(`createPiAgentAdapter`, () => { entityUrl: `test/entity-1`, epoch: 1, messages: [], - outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0 }, + outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0, reasoning: 0 }, writeEvent: (event: ChangeEvent) => { events.push(event) }, @@ -252,7 +252,7 @@ describe(`createPiAgentAdapter`, () => { entityUrl: `test/entity-1`, epoch: 1, messages: [], - outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0 }, + outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0, reasoning: 0 }, writeEvent: (_event: ChangeEvent) => {}, } @@ -271,7 +271,7 @@ describe(`createPiAgentAdapter`, () => { entityUrl: `test/entity-1`, epoch: 1, messages: [], - outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0 }, + outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0, reasoning: 0 }, writeEvent: (_event: ChangeEvent) => {}, } @@ -587,7 +587,7 @@ describe(`toAgentHistory`, () => { entityUrl: `test/entity-1`, epoch: 1, messages: [], - outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0 }, + outboundIdSeed: { run: 0, step: 0, msg: 0, tc: 0, reasoning: 0 }, writeEvent: (e: ChangeEvent) => { events.push(e) }, diff --git a/packages/agents-runtime/test/process-wake.test.ts b/packages/agents-runtime/test/process-wake.test.ts index e2ccf280dc..cd1c0a031b 100644 --- a/packages/agents-runtime/test/process-wake.test.ts +++ b/packages/agents-runtime/test/process-wake.test.ts @@ -171,6 +171,12 @@ vi.mock(`../src/entity-stream-db`, () => ({ const textDeltas = createLocalOnlyTestCollection>( [] ) + const reasoning = createLocalOnlyTestCollection>( + [] + ) + const reasoningDeltas = createLocalOnlyTestCollection< + Record + >([]) const toolCalls = createLocalOnlyTestCollection>( [] ) @@ -311,6 +317,8 @@ vi.mock(`../src/entity-stream-db`, () => ({ runs, texts, textDeltas, + reasoning, + reasoningDeltas, toolCalls, steps, manifests, diff --git a/packages/agents-server-ui/src/components/AgentResponse.tsx b/packages/agents-server-ui/src/components/AgentResponse.tsx index a98dc51b21..13f26f12c5 100644 --- a/packages/agents-server-ui/src/components/AgentResponse.tsx +++ b/packages/agents-server-ui/src/components/AgentResponse.tsx @@ -1,6 +1,7 @@ import { Check, Copy, GitFork } from 'lucide-react' import { memo, + useCallback, useEffect, useLayoutEffect, useMemo, @@ -26,7 +27,9 @@ import { ToolCallView } from './ToolCallView' import { TimeText } from './TimeText' import { ThinkingIndicator } from './ThinkingIndicator' import { ElapsedTime } from './ElapsedTime' +import { ReasoningBlock, type ReasoningEntry } from './ReasoningSection' import { TokenUsage } from './TokenUsage' + import { formatElapsedDuration, toMillis } from '../lib/formatTime' import styles from './AgentResponse.module.css' import type { ForkFromHereAction } from './UserMessage' @@ -303,6 +306,42 @@ function compareLiveRunItems( return runItemKey(left).localeCompare(runItemKey(right)) } +/** + * One renderable element of a live run — either a text/tool-call item + * or a reasoning block — tagged with its stream order so the two + * streams can be interleaved at the positions they were emitted + * (think → write → call tool → think → write …). + */ +type LiveRenderEntry = + | { + kind: `item` + key: string + order: string | number + item: EntityTimelineRunItem + } + | { + kind: `reasoning` + key: string + order: string | number + reasoning: ReasoningEntry + } + +function compareLiveRenderEntries( + left: LiveRenderEntry, + right: LiveRenderEntry +): number { + const orderCompare = compareTimelineOrderValues(left.order, right.order) + if (orderCompare !== 0) return orderCompare + if (left.kind === `item` && right.kind === `item`) { + return compareLiveRunItems(left.item, right.item) + } + // At equal order, reasoning precedes output — the model thinks, + // then writes. Mostly matters for legacy rows that predate + // `_timeline_order` and all coalesce to the same sentinel. + if (left.kind !== right.kind) return left.kind === `reasoning` ? -1 : 1 + return left.key.localeCompare(right.key) +} + function liveRunItemsToContentItems( items: Array ): Array { @@ -404,6 +443,47 @@ export const AgentResponseLive = memo(function AgentResponseLive({ (q) => (run.errors ? q.from({ error: run.errors }) : undefined), [run.errors] ) + // Subscribe to the run's reasoning rows so the section ticks as + // each `reasoning_delta` arrives. Empty array for runs without + // any reasoning content (most non-extended-thinking models). + const { data: reasoningRows = [] } = useLiveQuery( + (q) => (run.reasoning ? q.from({ reasoning: run.reasoning }) : undefined), + [run.reasoning] + ) + const reasoningEntries = useMemo>( + () => + ( + reasoningRows as Array<{ + key: string + status: `streaming` | `completed` + body?: { content?: string } + summary_title?: string + encrypted?: string + order?: string | number + }> + ) + .map((row) => ({ + key: row.key, + order: row.order ?? `~`, + status: row.status, + summary_title: row.summary_title, + encrypted: row.encrypted, + // The projection in `entity-timeline.ts` wraps content under + // `body` (inside a caseWhen) to force include materialization. + // See the comment there. + content: row.body?.content ?? ``, + })) + // Drop rows with nothing to show. The bridge opens a reasoning + // row on `thinking_start` even when no delta ever arrives — + // some providers (e.g. OpenAI codex models) report that the + // model reasoned but never expose the tokens — and an empty + // "Thought" block is pure noise. Encrypted rows stay: they're + // Anthropic redacted thinking, rendered as a placeholder. A + // row that is still streaming appears as soon as its first + // delta lands. + .filter((entry) => entry.content.trim().length > 0 || entry.encrypted), + [reasoningRows] + ) // Token totals are aggregated in the query layer // (`createEntityTimelineQuery`) — see the `runTokensSource` // leftJoin in `entity-timeline.ts`. The query sums each step's @@ -420,10 +500,44 @@ export const AgentResponseLive = memo(function AgentResponseLive({ if (input === undefined && output === undefined) return null return { input, output } }, [run.tokens]) + const sortedItems = useMemo( () => [...items].sort(compareLiveRunItems), [items] ) + // Interleave reasoning blocks with the run's items by stream order + // so each block renders where the model emitted it — before the + // step's text / tool calls, not lumped above the whole response. + const renderEntries = useMemo>( + () => + [ + ...sortedItems.map((item) => ({ + kind: `item`, + key: item.$key, + order: item.text?.order ?? item.toolCall?.order ?? `~`, + item, + })), + ...reasoningEntries.map((reasoning) => ({ + kind: `reasoning`, + key: reasoning.key, + order: reasoning.order, + reasoning, + })), + ].sort(compareLiveRenderEntries), + [sortedItems, reasoningEntries] + ) + // Expand/collapse state for settled reasoning blocks, keyed by row + // key. Owned here rather than inside `ReasoningBlock` so the user's + // choice survives the block being unmounted and remounted — e.g. + // when the reasoning row briefly disappears from the live query + // while another part of the run updates, or when a virtualizer + // measurement pass replaces the subtree. + const [expandedReasoning, setExpandedReasoning] = useState< + Record + >({}) + const toggleReasoning = useCallback((key: string) => { + setExpandedReasoning((prev) => ({ ...prev, [key]: !prev[key] })) + }, []) const contentItems = useMemo( () => liveRunItemsToContentItems(sortedItems), [sortedItems] @@ -498,13 +612,27 @@ export const AgentResponseLive = memo(function AgentResponseLive({ return ( - {sortedItems.map((item, i) => { + {renderEntries.map((entry) => { + if (entry.kind === `reasoning`) { + return ( + + ) + } + + const item = entry.item if (item.text) { return ( ) diff --git a/packages/agents-server-ui/src/components/ReasoningSection.module.css b/packages/agents-server-ui/src/components/ReasoningSection.module.css new file mode 100644 index 0000000000..c886acd49d --- /dev/null +++ b/packages/agents-server-ui/src/components/ReasoningSection.module.css @@ -0,0 +1,81 @@ +/* Reasoning blocks interleave with the agent's text / tool-call items + * at the stream position they were emitted. We want them to read as + * secondary content — never compete with the response — but stay + * legible enough that a curious user can skim them. + * + * Visual hierarchy: + * live → faded markdown body, animated "Thinking" heading + * settled → single muted line, click-to-expand + * redacted → single muted line, no expand */ + +.live { + border-left: 2px solid var(--ds-border-2); + padding-left: 10px; +} + +.header { + padding-bottom: 4px; +} + +.separator { + color: var(--ds-text-4); + opacity: 0.7; +} + +.title { + color: var(--ds-text-3); +} + +/* The reasoning body is rendered with `Streamdown` but at reduced + * weight so it reads as supporting material. `opacity` (rather than + * a different `color`) keeps inline code / links / emphasis tinted + * proportionally instead of forcing every span flat-muted. */ +.body { + opacity: 0.7; + font-size: 0.95em; +} + +/* Settled collapsed row. Click target spans the whole header. */ +.settled { + margin-block: 4px; +} + +.toggle { + /* Reset native button styling — visually it's just a muted line. */ + background: none; + border: none; + padding: 2px 0; + cursor: pointer; + text-align: left; + color: inherit; + font: inherit; +} + +.toggle:hover { + opacity: 1; +} + +.chevron { + display: inline-block; + width: 0.8em; + text-align: center; + color: var(--ds-text-4); + opacity: 0.7; + /* Tabular-style alignment so the chevron doesn't shift the trailing + * label between collapsed and expanded states (▸ and ▾ render at + * slightly different glyph widths in most fonts). */ + font-variant-numeric: tabular-nums; +} + +.expandedBody { + border-left: 2px solid var(--ds-border-2); + padding-left: 10px; + margin-top: 4px; + opacity: 0.7; + font-size: 0.95em; +} + +.redacted { + padding: 4px 0; + opacity: 0.6; +} diff --git a/packages/agents-server-ui/src/components/ReasoningSection.tsx b/packages/agents-server-ui/src/components/ReasoningSection.tsx new file mode 100644 index 0000000000..35ec725f31 --- /dev/null +++ b/packages/agents-server-ui/src/components/ReasoningSection.tsx @@ -0,0 +1,192 @@ +import { useEffect, useMemo, useRef, useState } from 'react' +import { Streamdown } from 'streamdown' +import { + streamdownComponents, + streamdownControls, + streamdownPlugins, +} from '../lib/streamdownConfig' +import { Stack, Text } from '../ui' +import { ThinkingIndicator } from './ThinkingIndicator' +import { ElapsedTime } from './ElapsedTime' +import { formatElapsedDuration, toMillis } from '../lib/formatTime' +import styles from './ReasoningSection.module.css' + +/** + * One reasoning row's worth of UI state — what the live query gives us + * for each row in `run.reasoning`. Mirrors `EntityTimelineReasoningItem` + * but pulled into a local type so the component file doesn't import + * from agents-runtime/client (keeps this file dep-light for the desktop + * + mobile embeds). + */ +export type ReasoningEntry = { + key: string + // Stream position of the reasoning row — same `_timeline_order` + // space as the run's text / tool-call items, so the parent can + // interleave reasoning blocks at the position they were emitted. + order: string | number + content: string + status: `streaming` | `completed` + summary_title?: string + encrypted?: string +} + +/** + * Renders the model's extended-thinking / reasoning content above the + * agent's visible response. Visual treatment intentionally mirrors + * Claude Code + OpenCode: + * + * - **While streaming**: faded markdown body with the `ThinkingIndicator` + * shimmer + the parsed `summary_title` (if any) as the heading. The + * elapsed-time ticker rides alongside so the user sees the model is + * actively chewing on the problem. + * - **Once settled**: collapses to a single-line `▸ Thought for 12s` + * row that the user can click to expand. Collapsed-by-default is the + * established pattern (OpenCode defaults to `hide` — reasoning is + * noise unless you're debugging). + * - **Anthropic redacted blocks** (`encrypted` set, no `content`): the + * provider has hidden the content behind a safety filter. We can't + * show anything meaningful, so render a single-line affordance and + * move on. The encrypted payload is still persisted server-side so + * the model gets it back on the next turn. + * + * Multiple reasoning rows per run are possible — typically one per LLM + * step in a tool-using turn — so the parent renders one block per row, + * interleaved with the run's text / tool-call items by stream order. + * + * Expand/collapse state is controlled by the parent (keyed by + * `entry.key`) rather than owned here, so the user's choice survives + * this block being unmounted and remounted — e.g. when the reasoning + * row briefly disappears from the live query while another part of + * the run updates, or when a virtualizer measurement pass replaces + * the subtree. + */ +export function ReasoningBlock({ + entry, + isStreaming, + timestamp, + expanded, + onToggle, +}: { + entry: ReasoningEntry + isStreaming: boolean + timestamp?: number | null + expanded: boolean + onToggle: (key: string) => void +}): React.ReactElement { + const isLive = isStreaming && entry.status === `streaming` + const handleToggle = useMemo( + () => () => onToggle(entry.key), + [entry.key, onToggle] + ) + + // Snapshot the elapsed duration at the moment streaming flips to + // `completed`, the same `sawStreamingRef` trick used for "done in + // Xs" on `AgentResponse`. For reasoning rows that were already + // settled on first mount (page reload, scrollback into older + // turns) we don't have a real end timestamp, so the closure stays + // a bare "Thought" without a duration — better than printing a + // wildly-wrong number from `now() - userMessageTime`. + const sawStreamingRef = useRef(isLive) + if (isLive) sawStreamingRef.current = true + const [finalDurationMs, setFinalDurationMs] = useState(null) + useEffect(() => { + if ( + entry.status === `completed` && + sawStreamingRef.current && + timestamp != null && + finalDurationMs == null + ) { + setFinalDurationMs(Math.max(0, Date.now() - toMillis(timestamp))) + } + }, [entry.status, timestamp, finalDurationMs]) + + // Redacted thinking — opaque payload, nothing to render. + if (entry.encrypted && entry.content.trim().length === 0) { + return ( +
+ + ⊘ Reasoning redacted by provider safety filters + +
+ ) + } + + if (isLive) { + return ( +
+ + + {entry.summary_title && ( + <> + + · + + + {entry.summary_title} + + + )} + {timestamp != null && ( + <> + + · + + + + )} + +
+ + {entry.content} + +
+
+ ) + } + + // Settled. + const closureLabel = + finalDurationMs != null + ? `Thought for ${formatElapsedDuration(finalDurationMs)}` + : `Thought` + + return ( +
+ + {expanded && ( +
+ + {entry.content} + +
+ )} +
+ ) +} diff --git a/packages/agents/src/model-catalog.ts b/packages/agents/src/model-catalog.ts index e1b74cc5d5..fc1ea1d1db 100644 --- a/packages/agents/src/model-catalog.ts +++ b/packages/agents/src/model-catalog.ts @@ -238,47 +238,101 @@ function filterChoicesByEnabledModels( return filtered.length > 0 ? filtered : choices } +/** + * Anthropic-specific budget mapping for `reasoningEffort`. + * + * Anthropic's `thinking.budget_tokens` is a hard cap on tokens spent + * inside the thinking block before the model must commit to its + * answer. Docs require ≥ 1024; we scale from there. Numbers tuned so + * `medium` is the spot most "show your work" requests land, and + * `high` covers tougher reasoning without uncapped spend. + * + * Keep in sync with provider doc updates — Anthropic has shifted the + * minimum once already (older models capped lower). + */ +const ANTHROPIC_THINKING_BUDGET_BY_EFFORT: Record< + ExplicitReasoningEffort, + number +> = { + minimal: 1024, + low: 2048, + medium: 8192, + high: 24576, +} + function withProviderPayloadDefaults( config: PersistedModelConfig & { getApiKey?: AgentConfig[`getApiKey`] }, choice: BuiltinModelChoice, reasoningEffort: ExplicitReasoningEffort | null ): BuiltinAgentModelConfig { - if ( - (choice.provider !== `openai` && choice.provider !== `openai-codex`) || - !choice.reasoning - ) - return config - - const defaultEffort = choice.provider === `openai-codex` ? `low` : `minimal` - const effort = - reasoningEffort === `minimal` && choice.provider === `openai-codex` - ? `low` - : (reasoningEffort ?? defaultEffort) + if (!choice.reasoning) return config + + if (choice.provider === `openai` || choice.provider === `openai-codex`) { + const defaultEffort = choice.provider === `openai-codex` ? `low` : `minimal` + const effort = + reasoningEffort === `minimal` && choice.provider === `openai-codex` + ? `low` + : (reasoningEffort ?? defaultEffort) + + return { + ...config, + onPayload: (payload) => { + if (typeof payload !== `object` || payload === null) return undefined + const body = payload as Record + const existingReasoning = + typeof body.reasoning === `object` && body.reasoning !== null + ? (body.reasoning as Record) + : {} + + return { + ...body, + // OpenAI Responses reasoning/tool-call continuations replay rs_* + // reasoning items. With store:false, OpenAI does not persist those + // items server-side, which can make follow-up requests fail with + // "Item with id ... not found". Keep Responses stateful for built-ins. + store: true, + reasoning: { + ...existingReasoning, + effort, + }, + } + }, + } + } - return { - ...config, - onPayload: (payload) => { - if (typeof payload !== `object` || payload === null) return undefined - const body = payload as Record - const existingReasoning = - typeof body.reasoning === `object` && body.reasoning !== null - ? (body.reasoning as Record) - : {} - - return { - ...body, - // OpenAI Responses reasoning/tool-call continuations replay rs_* - // reasoning items. With store:false, OpenAI does not persist those - // items server-side, which can make follow-up requests fail with - // "Item with id ... not found". Keep Responses stateful for built-ins. - store: true, - reasoning: { - ...existingReasoning, - effort, - }, - } - }, + if (choice.provider === `anthropic`) { + // `auto` maps to the minimal budget so extended thinking is always + // on for reasoning-capable Anthropic models, matching the OpenAI + // branch above (where `auto` falls through to a `minimal` default). + const effectiveEffort = reasoningEffort ?? `minimal` + const budgetTokens = ANTHROPIC_THINKING_BUDGET_BY_EFFORT[effectiveEffort] + + return { + ...config, + onPayload: (payload) => { + if (typeof payload !== `object` || payload === null) return undefined + const body = payload as Record + // pi-ai writes `thinking: { type: "disabled" }` into the payload + // by default. Merge our enabled-thinking values last so they win + // — otherwise the API rejects `budget_tokens` for a disabled + // `thinking` block. + const existingThinking = + typeof body.thinking === `object` && body.thinking !== null + ? (body.thinking as Record) + : {} + return { + ...body, + thinking: { + ...existingThinking, + type: `enabled`, + budget_tokens: budgetTokens, + }, + } + }, + } } + + return config } function parseReasoningEffort(value: unknown): ExplicitReasoningEffort | null { diff --git a/packages/agents/test/model-catalog.test.ts b/packages/agents/test/model-catalog.test.ts index ec1f3f9a57..f5fe518329 100644 --- a/packages/agents/test/model-catalog.test.ts +++ b/packages/agents/test/model-catalog.test.ts @@ -151,6 +151,88 @@ describe(`model catalog`, () => { }) }) + it(`enables Anthropic extended thinking with a minimal budget when reasoningEffort is auto`, async () => { + process.env.ANTHROPIC_API_KEY = `test-anthropic-key` + vi.stubGlobal( + `fetch`, + vi.fn(async (url: string) => { + if (String(url).includes(`api.anthropic.com`)) { + return { + ok: true, + status: 200, + json: async () => ({ data: [{ id: `claude-sonnet-4-6` }] }), + } + } + return { ok: false, status: 401, json: async () => ({}) } + }) + ) + + const catalog = await createBuiltinModelCatalog() + const config = resolveBuiltinModelConfig(catalog!, { + model: `anthropic:claude-sonnet-4-6`, + }) + + expect(config.onPayload).toBeTypeOf(`function`) + expect(config.onPayload!({}, {} as any)).toEqual({ + thinking: { type: `enabled`, budget_tokens: 1024 }, + }) + }) + + it(`overrides a pre-existing thinking.type=disabled in the Anthropic payload`, async () => { + process.env.ANTHROPIC_API_KEY = `test-anthropic-key` + vi.stubGlobal( + `fetch`, + vi.fn(async (url: string) => { + if (String(url).includes(`api.anthropic.com`)) { + return { + ok: true, + status: 200, + json: async () => ({ data: [{ id: `claude-sonnet-4-6` }] }), + } + } + return { ok: false, status: 401, json: async () => ({}) } + }) + ) + + const catalog = await createBuiltinModelCatalog() + const config = resolveBuiltinModelConfig(catalog!, { + model: `anthropic:claude-sonnet-4-6`, + }) + + expect( + config.onPayload!({ thinking: { type: `disabled` } }, {} as any) + ).toEqual({ + thinking: { type: `enabled`, budget_tokens: 1024 }, + }) + }) + + it(`scales Anthropic thinking budget with explicit reasoningEffort`, async () => { + process.env.ANTHROPIC_API_KEY = `test-anthropic-key` + vi.stubGlobal( + `fetch`, + vi.fn(async (url: string) => { + if (String(url).includes(`api.anthropic.com`)) { + return { + ok: true, + status: 200, + json: async () => ({ data: [{ id: `claude-sonnet-4-6` }] }), + } + } + return { ok: false, status: 401, json: async () => ({}) } + }) + ) + + const catalog = await createBuiltinModelCatalog() + const config = resolveBuiltinModelConfig(catalog!, { + model: `anthropic:claude-sonnet-4-6`, + reasoningEffort: `high`, + }) + + expect(config.onPayload!({}, {} as any)).toEqual({ + thinking: { type: `enabled`, budget_tokens: 24576 }, + }) + }) + it(`forces store true only for OpenAI reasoning model payloads`, async () => { const openAiCatalog = await createBuiltinModelCatalog() const openAiConfig = resolveBuiltinModelConfig(openAiCatalog!, {