From d7a37984d184c471262a730e30fb8742aa0954c9 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 11:23:56 +0100 Subject: [PATCH 01/31] feat(agents): add realtime stream foundations --- packages/agents-runtime/src/entity-schema.ts | 228 ++++++++++++++++++- packages/agents-runtime/src/index.ts | 12 + packages/agents-runtime/src/types.ts | 12 + packages/agents-server/src/index.ts | 3 + packages/agents-server/src/stream-client.ts | 35 ++- 5 files changed, 281 insertions(+), 9 deletions(-) diff --git a/packages/agents-runtime/src/entity-schema.ts b/packages/agents-runtime/src/entity-schema.ts index 7d70d3cef2..0762ca2f70 100644 --- a/packages/agents-runtime/src/entity-schema.ts +++ b/packages/agents-runtime/src/entity-schema.ts @@ -347,6 +347,85 @@ type ManifestFutureSendScheduleEntryValue = { failedAt?: string lastError?: string } +type RealtimeSessionStatusValue = + | `requested` + | `active` + | `closing` + | `closed` + | `failed` +type RealtimeSessionStreamRefsValue = { + audio_in: string + audio_out: string + control_in: string + control_out: string +} +type ManifestRealtimeSessionEntryValue = { + key?: string + kind: `realtime-session` + id: string + provider: string + model: string + status: RealtimeSessionStatusValue + startedAt: string + endedAt?: string | null + streams: RealtimeSessionStreamRefsValue + retention: `forever` + meta?: Record +} +type RealtimeSessionValue = { + key?: string + session_id: string + provider: string + model: string + status: RealtimeSessionStatusValue + started_at: string + ended_at?: string + streams: RealtimeSessionStreamRefsValue + reason?: string + error?: string + meta?: Record +} +type RealtimeAudioSpanValue = { + key?: string + session_id: string + stream: `input` | `output` + producer_id: string + producer_epoch: number + seq: number + offset: string + next_offset?: string + byte_start?: number + byte_end?: number + byte_length: number + sample_start: number + sample_count: number + sample_rate: number + channels: number + codec: `pcm16` + timing_source: `client` | `runtime` | `provider` + captured_at?: string + received_at?: string + participant_id?: string + turn_id?: string + provider_item_id?: string + response_id?: string + created_at: string +} +type RealtimeTranscriptValue = { + key?: string + session_id: string + direction: `input` | `output` + text: string + status: `partial` | `final` + turn_id?: string + response_id?: string + audio_stream?: `input` | `output` + audio_offset?: string + audio_next_offset?: string + sample_start?: number + sample_end?: number + created_at: string +} type ReplayWatermarkValue = { key?: string source_id: string @@ -707,6 +786,20 @@ function createContextRemovedSchema(): Schema { timestamp: z.string(), }) } + +function createRealtimeSessionStreamRefsSchema(): Schema { + return z.object({ + audio_in: z.string(), + audio_out: z.string(), + control_in: z.string(), + control_out: z.string(), + }) +} + +function createRealtimeSessionStatusSchema() { + return z.enum([`requested`, `active`, `closing`, `closed`, `failed`]) +} + function createManifestSchema(): Schema< | ManifestChildEntryValue | ManifestSourceEntryValue @@ -716,6 +809,7 @@ function createManifestSchema(): Schema< | ManifestContextEntryValue | ManifestCronScheduleEntryValue | ManifestFutureSendScheduleEntryValue + | ManifestRealtimeSessionEntryValue > { return z.union([ z.object({ @@ -818,6 +912,20 @@ function createManifestSchema(): Schema< failedAt: z.string().optional(), lastError: z.string().optional(), }), + z.object({ + key: z.string().optional(), + ...timelineOrderField, + kind: z.literal(`realtime-session`), + id: z.string(), + provider: z.string(), + model: z.string(), + status: createRealtimeSessionStatusSchema(), + startedAt: z.string(), + endedAt: z.string().nullable().optional(), + streams: createRealtimeSessionStreamRefsSchema(), + retention: z.literal(`forever`).default(`forever`), + meta: createJsonObjectSchema().optional(), + }), ]) as unknown as Schema< | ManifestChildEntryValue | ManifestSourceEntryValue @@ -827,9 +935,76 @@ function createManifestSchema(): Schema< | ManifestContextEntryValue | ManifestCronScheduleEntryValue | ManifestFutureSendScheduleEntryValue + | ManifestRealtimeSessionEntryValue > } +function createRealtimeSessionSchema(): Schema { + return z.object({ + key: z.string().optional(), + ...timelineOrderField, + session_id: z.string(), + provider: z.string(), + model: z.string(), + status: createRealtimeSessionStatusSchema(), + started_at: z.string(), + ended_at: z.string().optional(), + streams: createRealtimeSessionStreamRefsSchema(), + reason: z.string().optional(), + error: z.string().optional(), + meta: createJsonObjectSchema().optional(), + }) +} + +function createRealtimeAudioSpanSchema(): Schema { + return z.object({ + key: z.string().optional(), + ...timelineOrderField, + session_id: z.string(), + stream: z.enum([`input`, `output`]), + producer_id: z.string(), + producer_epoch: z.number().int().nonnegative(), + seq: z.number().int().nonnegative(), + offset: z.string(), + next_offset: z.string().optional(), + byte_start: z.number().int().nonnegative().optional(), + byte_end: z.number().int().nonnegative().optional(), + byte_length: z.number().int().nonnegative(), + sample_start: z.number().int().nonnegative(), + sample_count: z.number().int().nonnegative(), + sample_rate: z.number().int().positive(), + channels: z.number().int().positive(), + codec: z.literal(`pcm16`), + timing_source: z.enum([`client`, `runtime`, `provider`]), + captured_at: z.string().optional(), + received_at: z.string().optional(), + participant_id: z.string().optional(), + turn_id: z.string().optional(), + provider_item_id: z.string().optional(), + response_id: z.string().optional(), + created_at: z.string(), + }) +} + +function createRealtimeTranscriptSchema(): Schema { + return z.object({ + key: z.string().optional(), + ...timelineOrderField, + session_id: z.string(), + direction: z.enum([`input`, `output`]), + text: z.string(), + status: z.enum([`partial`, `final`]), + turn_id: z.string().optional(), + response_id: z.string().optional(), + audio_stream: z.enum([`input`, `output`]).optional(), + audio_offset: z.string().optional(), + audio_next_offset: z.string().optional(), + sample_start: z.number().int().nonnegative().optional(), + sample_end: z.number().int().nonnegative().optional(), + created_at: z.string(), + }) +} + function createReplayWatermarkSchema(): Schema { return z.object({ key: z.string().optional(), @@ -881,6 +1056,10 @@ export type ManifestCronScheduleEntry = SequencedPersistedRow export type ManifestFutureSendScheduleEntry = SequencedPersistedRow +export type RealtimeSessionStatus = RealtimeSessionStatusValue +export type RealtimeSessionStreamRefs = RealtimeSessionStreamRefsValue +export type ManifestRealtimeSessionEntry = + SequencedPersistedRow type ManifestUnion = | ManifestChildEntry | ManifestSourceEntry @@ -890,6 +1069,7 @@ type ManifestUnion = | ManifestContextEntry | ManifestCronScheduleEntry | ManifestFutureSendScheduleEntry + | ManifestRealtimeSessionEntry export type Manifest = ManifestUnion & { id?: string entity_url?: string @@ -921,11 +1101,23 @@ export type Manifest = ManifestUnion & { targetUrl?: string producerId?: string messageType?: string - status?: FutureSendScheduleStatus | AttachmentStatusValue + status?: + | FutureSendScheduleStatus + | AttachmentStatusValue + | RealtimeSessionStatusValue sentAt?: string failedAt?: string lastError?: string -} + provider?: string + model?: string + startedAt?: string + endedAt?: string | null + streams?: RealtimeSessionStreamRefs + retention?: `forever` +} +export type RealtimeSession = SequencedPersistedRow +export type RealtimeAudioSpan = SequencedPersistedRow +export type RealtimeTranscript = SequencedPersistedRow export type ReplayWatermark = SequencedPersistedRow // ============================================================================ @@ -949,6 +1141,9 @@ export const ENTITY_COLLECTIONS = { tags: `tags`, slashCommands: `slashCommands`, manifests: `manifests`, + realtimeSessions: `realtimeSessions`, + realtimeAudioSpans: `realtimeAudioSpans`, + realtimeTranscripts: `realtimeTranscripts`, contextInserted: `contextInserted`, contextRemoved: `contextRemoved`, replayWatermarks: `replayWatermarks`, @@ -982,6 +1177,12 @@ export const BUILT_IN_EVENT_SCHEMAS = { context_removed: createContextRemovedSchema() as unknown as BuiltInEntitySchema, manifest: createManifestSchema() as unknown as BuiltInEntitySchema, + realtime_session: + createRealtimeSessionSchema() as unknown as BuiltInEntitySchema, + realtime_audio_span: + createRealtimeAudioSpanSchema() as unknown as BuiltInEntitySchema, + realtime_transcript: + createRealtimeTranscriptSchema() as unknown as BuiltInEntitySchema, replay_watermark: createReplayWatermarkSchema() as unknown as BuiltInEntitySchema, } as const @@ -1008,6 +1209,9 @@ type EntityCollectionsDefinition = { tags: CollectionDefinition slashCommands: CollectionDefinition manifests: CollectionDefinition + realtimeSessions: CollectionDefinition + realtimeAudioSpans: CollectionDefinition + realtimeTranscripts: CollectionDefinition contextInserted: CollectionDefinition contextRemoved: CollectionDefinition replayWatermarks: CollectionDefinition @@ -1104,6 +1308,24 @@ export const builtInCollections: EntityCollectionsDefinition = { type: `manifest`, primaryKey: `key`, }, + realtimeSessions: { + schema: + BUILT_IN_EVENT_SCHEMAS.realtime_session as StandardSchemaV1, + type: `realtime_session`, + primaryKey: `key`, + }, + realtimeAudioSpans: { + schema: + BUILT_IN_EVENT_SCHEMAS.realtime_audio_span as StandardSchemaV1, + type: `realtime_audio_span`, + primaryKey: `key`, + }, + realtimeTranscripts: { + schema: + BUILT_IN_EVENT_SCHEMAS.realtime_transcript as StandardSchemaV1, + type: `realtime_transcript`, + primaryKey: `key`, + }, contextInserted: { schema: BUILT_IN_EVENT_SCHEMAS.context_inserted as StandardSchemaV1, @@ -1140,6 +1362,8 @@ const MANAGEMENT_TYPES = new Set([ `entity_created`, `signal`, `manifest`, + `realtime_session`, + `realtime_audio_span`, `replay_watermark`, `ack`, ]) diff --git a/packages/agents-runtime/src/index.ts b/packages/agents-runtime/src/index.ts index 3275e31be3..e4015ee3e1 100644 --- a/packages/agents-runtime/src/index.ts +++ b/packages/agents-runtime/src/index.ts @@ -9,8 +9,14 @@ export type { ManifestContextEntry, ManifestEntry, ManifestEffectEntry, + ManifestRealtimeSessionEntry, ManifestSourceEntry, ManifestSharedStateEntry, + RealtimeAudioSpan, + RealtimeSession, + RealtimeSessionStatus, + RealtimeSessionStreamRefs, + RealtimeTranscript, PendingSend, EffectConfig, ObservationSource, @@ -113,6 +119,12 @@ export type { AttachmentSubject, AttachmentSubjectType, ManifestContextEntry as ManifestContextEntryRow, + ManifestRealtimeSessionEntry as ManifestRealtimeSessionEntryRow, + RealtimeAudioSpan as RealtimeAudioSpanRow, + RealtimeSession as RealtimeSessionRow, + RealtimeSessionStatus as RealtimeSessionStatusRow, + RealtimeSessionStreamRefs as RealtimeSessionStreamRefsRow, + RealtimeTranscript as RealtimeTranscriptRow, ReplayWatermark, WakeConfigValue, } from './entity-schema' diff --git a/packages/agents-runtime/src/types.ts b/packages/agents-runtime/src/types.ts index ec366ab670..48c3d44bc3 100644 --- a/packages/agents-runtime/src/types.ts +++ b/packages/agents-runtime/src/types.ts @@ -43,8 +43,14 @@ import type { ManifestCronScheduleEntry as EntityManifestCronScheduleEntry, ManifestEffectEntry as EntityManifestEffectEntry, ManifestFutureSendScheduleEntry as EntityManifestFutureSendScheduleEntry, + ManifestRealtimeSessionEntry as EntityManifestRealtimeSessionEntry, ManifestSharedStateEntry as EntityManifestSharedStateEntry, ManifestSourceEntry as EntityManifestSourceEntry, + RealtimeAudioSpan as EntityRealtimeAudioSpan, + RealtimeSession as EntityRealtimeSession, + RealtimeSessionStatus as EntityRealtimeSessionStatus, + RealtimeSessionStreamRefs as EntityRealtimeSessionStreamRefs, + RealtimeTranscript as EntityRealtimeTranscript, Signal as EntitySignalEntry, WakeEntry, } from './entity-schema' @@ -321,8 +327,14 @@ export type ManifestCronScheduleEntry = EntityManifestCronScheduleEntry export type ManifestEffectEntry = EntityManifestEffectEntry export type ManifestFutureSendScheduleEntry = EntityManifestFutureSendScheduleEntry +export type ManifestRealtimeSessionEntry = EntityManifestRealtimeSessionEntry export type ManifestSourceEntry = EntityManifestSourceEntry export type ManifestSharedStateEntry = EntityManifestSharedStateEntry +export type RealtimeSession = EntityRealtimeSession +export type RealtimeSessionStatus = EntityRealtimeSessionStatus +export type RealtimeSessionStreamRefs = EntityRealtimeSessionStreamRefs +export type RealtimeAudioSpan = EntityRealtimeAudioSpan +export type RealtimeTranscript = EntityRealtimeTranscript export type ContextInserted = EntityContextInserted export type ContextRemoved = EntityContextRemoved export type ContextEntryAttrs = EntityContextEntryAttrs diff --git a/packages/agents-server/src/index.ts b/packages/agents-server/src/index.ts index e411f78771..d4392a10d8 100644 --- a/packages/agents-server/src/index.ts +++ b/packages/agents-server/src/index.ts @@ -9,7 +9,10 @@ export type { export { StreamClient } from './stream-client.js' export type { DurableStreamsBearerProvider, + StreamAppendOptions, StreamClientOptions, + StreamIdempotentAppendOptions, + StreamProducerHeaderAppendOptions, SubscriptionClaimResponse, SubscriptionCreateInput, SubscriptionResponse, diff --git a/packages/agents-server/src/stream-client.ts b/packages/agents-server/src/stream-client.ts index 96e92de279..da74d5de8d 100644 --- a/packages/agents-server/src/stream-client.ts +++ b/packages/agents-server/src/stream-client.ts @@ -19,6 +19,26 @@ export interface StreamAppendResult { offset: string } +export interface StreamAppendOptions { + close?: boolean + contentType?: string + batching?: boolean +} + +export interface StreamIdempotentAppendOptions { + producerId: string + epoch?: number + contentType?: string + batching?: boolean +} + +export interface StreamProducerHeaderAppendOptions { + producerId: string + epoch: number + seq: number + contentType?: string +} + export interface StreamMessage { data: Uint8Array offset: string @@ -286,7 +306,7 @@ export class StreamClient { async append( path: string, data: Uint8Array | string, - opts?: { close?: boolean } + opts: StreamAppendOptions = {} ): Promise { return await withSpan(`stream.append`, async (span) => { span.setAttributes({ @@ -296,8 +316,8 @@ export class StreamClient { const handle = new DurableStream({ url: this.streamUrl(path), headers: this.streamHeaders(), - contentType: `application/json`, - batching: false, + contentType: opts.contentType ?? `application/json`, + batching: opts.batching ?? false, }) if (opts?.close) { const result = await handle.close({ body: data }) @@ -313,7 +333,7 @@ export class StreamClient { async appendIdempotent( path: string, data: Uint8Array | string, - opts: { producerId: string; epoch?: number } + opts: StreamIdempotentAppendOptions ): Promise { return await withSpan(`stream.appendIdempotent`, async (span) => { span.setAttributes({ @@ -323,7 +343,8 @@ export class StreamClient { const stream = new DurableStream({ url: this.streamUrl(path), headers: this.streamHeaders(), - contentType: `application/json`, + contentType: opts.contentType ?? `application/json`, + batching: opts.batching, }) const producer = new IdempotentProducer(stream, opts.producerId, { epoch: opts.epoch ?? 0, @@ -341,7 +362,7 @@ export class StreamClient { async appendWithProducerHeaders( path: string, data: Uint8Array | string, - opts: { producerId: string; epoch: number; seq: number } + opts: StreamProducerHeaderAppendOptions ): Promise { return await withSpan(`stream.appendWithProducerHeaders`, async (span) => { span.setAttributes({ @@ -349,7 +370,7 @@ export class StreamClient { [ATTR.STREAM_OP]: `appendWithProducerHeaders`, }) const headers: Record = { - 'content-type': `application/json`, + 'content-type': opts.contentType ?? `application/json`, 'Producer-Id': opts.producerId, 'Producer-Epoch': String(opts.epoch), 'Producer-Seq': String(opts.seq), From ce2ba486dd6f9ca0ee41154854f177d23ba47ee3 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 11:36:25 +0100 Subject: [PATCH 02/31] feat(agents-runtime): add realtime handler API --- .../agents-runtime/src/context-factory.ts | 278 ++++++++++++++++++ packages/agents-runtime/src/index.ts | 16 + packages/agents-runtime/src/realtime.ts | 42 +++ packages/agents-runtime/src/types.ts | 148 ++++++++++ .../test/realtime-context.test.ts | 65 ++++ 5 files changed, 549 insertions(+) create mode 100644 packages/agents-runtime/src/realtime.ts create mode 100644 packages/agents-runtime/test/realtime-context.test.ts diff --git a/packages/agents-runtime/src/context-factory.ts b/packages/agents-runtime/src/context-factory.ts index 006b5c7241..379a9c54c5 100644 --- a/packages/agents-runtime/src/context-factory.ts +++ b/packages/agents-runtime/src/context-factory.ts @@ -42,8 +42,14 @@ import type { HandlerWake, LLMMessage, ManifestAttachmentEntry, + ManifestRealtimeSessionEntry, ObservationHandle, ObservationSource, + RealtimeConfig, + RealtimeHandle, + RealtimeProviderEvent, + RealtimeProviderSession, + RealtimeRunResult, RunHandle, SendResult, SharedStateHandle, @@ -66,6 +72,41 @@ function agentModelProvider(config: AgentConfig): string { : config.model.provider } +function isRealtimeSessionManifest( + entry: unknown +): entry is ManifestRealtimeSessionEntry { + return ( + typeof entry === `object` && + entry !== null && + (entry as { kind?: unknown }).kind === `realtime-session` && + typeof (entry as { id?: unknown }).id === `string` + ) +} + +function realtimeManifestIsActive( + entry: ManifestRealtimeSessionEntry +): boolean { + return entry.status === `requested` || entry.status === `active` +} + +function getToolName(tool: AgentTool): string | null { + const name = (tool as { name?: unknown }).name + return typeof name === `string` ? name : null +} + +function applyRealtimeToolPolicy( + tools: Array, + policy: RealtimeConfig[`toolPolicy`] +): Array { + if (!policy) return tools + const allowed = new Set([...(policy.direct ?? []), ...(policy.confirm ?? [])]) + if (allowed.size === 0) return [] + return tools.filter((tool) => { + const name = getToolName(tool) + return name != null && allowed.has(name) + }) +} + const MAX_HYDRATED_IMAGE_ATTACHMENTS = 4 const MAX_HYDRATED_IMAGE_ATTACHMENT_BYTES = 10 * 1024 * 1024 @@ -447,6 +488,8 @@ export function createHandlerContext( ): HandlerContextResult { let sleepRequested = false let agentConfig: AgentConfig | null = null + let realtimeConfig: RealtimeConfig | null = null + let activeRealtimeProviderSession: RealtimeProviderSession | null = null let useContextConfig: UseContextConfig | null = null let useContextHash = `` let useContextRegistrations = 0 @@ -514,6 +557,20 @@ export function createHandlerContext( }, } + function realtimeSessions(): Array { + const sessions: Array = [] + for (const entry of config.db.collections.manifests.toArray) { + if (isRealtimeSessionManifest(entry)) { + sessions.push(entry) + } + } + return sessions.sort((a, b) => a.startedAt.localeCompare(b.startedAt)) + } + + function activeRealtimeSession(): ManifestRealtimeSessionEntry | undefined { + return realtimeSessions().filter(realtimeManifestIsActive).at(-1) + } + function structuralHash(nextConfig: UseContextConfig): string { const sources = Object.entries(nextConfig.sources) .sort(([leftName], [rightName]) => leftName.localeCompare(rightName)) @@ -911,6 +968,219 @@ export function createHandlerContext( }, } + const realtimeHandle: RealtimeHandle = { + async run(): Promise { + if (!realtimeConfig) { + throw new Error( + `[agent-runtime] realtime.run() called without useRealtime().` + ) + } + + if (config.prepareAgentRun) { + await config.prepareAgentRun() + } + + const activeRealtimeConfig = realtimeConfig + const bridge = createOutboundBridge( + await loadOutboundIdSeed(config.db), + config.writeEvent + ) + const startedAt = Date.now() + let textStarted = false + let currentToolCall: + | { toolCallId: string; name: string; args: unknown } + | undefined + + const endText = (): void => { + if (!textStarted) return + bridge.onTextEnd() + textStarted = false + } + + const emitText = (delta: string): void => { + if (delta.length === 0) return + if (!textStarted) { + bridge.onTextStart() + textStarted = true + } + bridge.onTextDelta(delta) + } + + const composedTools = (await composeToolsWithProviders( + activeRealtimeConfig.tools ?? [] + )) as Array + const providerTools = applyRealtimeToolPolicy( + composedTools, + activeRealtimeConfig.toolPolicy + ) + const messages = await hydrateAttachmentBlocks( + timelineToMessages(config.db) + ) + + async function handleProviderEvent( + event: RealtimeProviderEvent + ): Promise { + switch (event.type) { + case `session.started`: + case `session.updated`: + case `input_audio.speech_started`: + case `input_audio.speech_stopped`: + case `input_transcript.delta`: + case `input_transcript.completed`: + case `output_audio.delta`: + case `output_audio.completed`: + case `response.started`: + case `response.cancelled`: + break + + case `session.closed`: + case `response.completed`: + endText() + break + + case `session.error`: + throw new Error( + `[agent-runtime] realtime provider error${event.code ? ` ${event.code}` : ``}: ${event.error}` + ) + + case `output_transcript.delta`: + emitText(event.delta) + break + + case `output_transcript.completed`: + if (!textStarted && event.text) { + emitText(event.text) + } + endText() + break + + case `tool_call.started`: + currentToolCall = { + toolCallId: event.toolCallId, + name: event.name, + args: event.args, + } + if (event.args !== undefined) { + bridge.onToolCallStart(event.toolCallId, event.name, event.args) + } + break + + case `tool_call.arguments_delta`: + break + + case `tool_call.arguments_completed`: + currentToolCall = { + toolCallId: event.toolCallId, + name: event.name, + args: event.args, + } + bridge.onToolCallStart(event.toolCallId, event.name, event.args) + break + + case `tool_call.completed`: { + if (currentToolCall?.toolCallId !== event.toolCallId) { + bridge.onToolCallStart(event.toolCallId, event.name, undefined) + } + bridge.onToolCallEnd( + event.toolCallId, + event.name, + event.result, + event.isError ?? false + ) + break + } + } + } + + try { + bridge.onRunStart() + bridge.onStepStart({ + modelProvider: activeRealtimeConfig.provider.id, + modelId: activeRealtimeConfig.provider.model, + }) + + if (activeRealtimeConfig.testResponses) { + const messageText = getTriggerMessageText( + config.db, + config.wakeEvent, + config.events, + config.wakeOffset, + config.hydratedEventSourceWake + ) + const responses = activeRealtimeConfig.testResponses + if (Array.isArray(responses)) { + const priorRunCount = ( + await queryOnce((q) => + q.from({ runs: config.db.collections.runs }) + ) + ).length + emitText( + responses[priorRunCount % Math.max(responses.length, 1)] ?? `` + ) + } else { + const response = await responses(messageText, bridge) + if (response !== undefined) emitText(response) + } + endText() + } else { + activeRealtimeProviderSession = + await activeRealtimeConfig.provider.connect({ + systemPrompt: activeRealtimeConfig.systemPrompt, + messages, + tools: providerTools, + audio: activeRealtimeConfig.audio, + session: activeRealtimeSession(), + signal: config.runSignal, + }) + + for await (const event of activeRealtimeProviderSession.events) { + if (config.runSignal?.aborted) { + break + } + await handleProviderEvent(event) + } + } + + endText() + bridge.onStepEnd({ + finishReason: config.runSignal?.aborted ? `aborted` : `stop`, + durationMs: Date.now() - startedAt, + }) + bridge.onRunEnd({ + finishReason: config.runSignal?.aborted ? `aborted` : `stop`, + }) + } catch (error) { + endText() + bridge.onStepEnd({ + finishReason: `error`, + durationMs: Date.now() - startedAt, + }) + bridge.onRunEnd({ finishReason: `error` }) + throw error + } finally { + activeRealtimeProviderSession = null + } + + return { + writes: [], + toolCalls: [], + usage: { tokens: 0, duration: Date.now() - startedAt }, + } + }, + async close(reason?: string): Promise { + await activeRealtimeProviderSession?.close?.(reason) + }, + async stop(reason?: string): Promise { + await this.close(reason) + }, + async cancelResponse(): Promise { + await activeRealtimeProviderSession?.cancelResponse?.() + }, + async sendText(text: string): Promise { + await activeRealtimeProviderSession?.sendText?.(text) + }, + } + const ctx: DebugHandlerContext = { firstWake: config.firstWake, wake: toHandlerWake(config.wakeEvent), @@ -931,6 +1201,10 @@ export function createHandlerContext( agentConfig = cfg return agent }, + useRealtime(cfg) { + realtimeConfig = cfg + return realtimeHandle + }, useContext(nextConfig) { assertValidUseContextConfig(nextConfig) const hash = structuralHash(nextConfig) @@ -951,6 +1225,10 @@ export function createHandlerContext( useContextRegistrations: () => useContextRegistrations, }, agent, + realtime: { + activeSession: activeRealtimeSession, + sessions: realtimeSessions, + }, observe: ((source: ObservationSource, opts?: { wake?: Wake }) => { return config.doObserve(source, opts?.wake) as Promise< ObservationHandle & EntityHandle & SharedStateHandle diff --git a/packages/agents-runtime/src/index.ts b/packages/agents-runtime/src/index.ts index e4015ee3e1..fe065de595 100644 --- a/packages/agents-runtime/src/index.ts +++ b/packages/agents-runtime/src/index.ts @@ -13,9 +13,23 @@ export type { ManifestSourceEntry, ManifestSharedStateEntry, RealtimeAudioSpan, + RealtimeAudioConfig, + RealtimeAudioFormat, + RealtimeConfig, + RealtimeContextConfig, + RealtimeHandle, + RealtimeHelpers, + RealtimeProviderConfig, + RealtimeProviderConnectInput, + RealtimeProviderEvent, + RealtimeProviderSession, + RealtimeRunResult, RealtimeSession, + RealtimeSessionPolicy, RealtimeSessionStatus, RealtimeSessionStreamRefs, + RealtimeToolPolicy, + RealtimeToolResult, RealtimeTranscript, PendingSend, EffectConfig, @@ -130,6 +144,8 @@ export type { } from './entity-schema' export { createEntityStreamDB } from './entity-stream-db' +export { createTestRealtimeProvider } from './realtime' +export type { TestRealtimeProviderOptions } from './realtime' export { getEntityAttachmentStreamPath, manifestAttachmentKey, diff --git a/packages/agents-runtime/src/realtime.ts b/packages/agents-runtime/src/realtime.ts new file mode 100644 index 0000000000..5916d4ebd6 --- /dev/null +++ b/packages/agents-runtime/src/realtime.ts @@ -0,0 +1,42 @@ +import type { RealtimeProviderConfig, RealtimeProviderEvent } from './types' + +export interface TestRealtimeProviderOptions { + model?: string + events?: Array + response?: string +} + +export function createTestRealtimeProvider( + opts: TestRealtimeProviderOptions = {} +): RealtimeProviderConfig { + return { + id: `test`, + model: opts.model ?? `test-realtime`, + async connect() { + const events = + opts.events ?? + (opts.response != null + ? [ + { type: `session.started` as const }, + { + type: `output_transcript.completed` as const, + text: opts.response, + }, + { type: `response.completed` as const }, + { type: `session.closed` as const }, + ] + : [ + { type: `session.started` as const }, + { type: `session.closed` as const }, + ]) + + return { + events: (async function* () { + for (const event of events) { + yield event + } + })(), + } + }, + } +} diff --git a/packages/agents-runtime/src/types.ts b/packages/agents-runtime/src/types.ts index 48c3d44bc3..53d875f4e8 100644 --- a/packages/agents-runtime/src/types.ts +++ b/packages/agents-runtime/src/types.ts @@ -931,6 +931,152 @@ export interface AgentConfig { testResponses?: TestResponses } +export type RealtimeAudioCodec = `pcm16` + +export interface RealtimeAudioFormat { + codec: RealtimeAudioCodec + sampleRate: number + channels: number +} + +export interface RealtimeAudioConfig { + inputFormat?: RealtimeAudioFormat + outputFormat?: RealtimeAudioFormat +} + +export interface RealtimeToolPolicy { + direct?: Array + confirm?: Array + delegate?: Array +} + +export interface RealtimeSessionPolicy { + textDuringSession?: `route-to-realtime` + retention?: `forever` +} + +export interface RealtimeContextConfig { + includeTimeline?: boolean +} + +export type RealtimeProviderEvent = + | { type: `session.started`; sessionId?: string } + | { type: `session.updated` } + | { type: `session.closed`; reason?: string } + | { type: `session.error`; error: string; code?: string } + | { type: `input_audio.speech_started`; audioOffset?: string } + | { type: `input_audio.speech_stopped`; audioOffset?: string } + | { type: `input_transcript.delta`; delta: string; turnId?: string } + | { type: `input_transcript.completed`; text: string; turnId?: string } + | { + type: `output_audio.delta` + audio: Uint8Array + responseId?: string + itemId?: string + } + | { type: `output_audio.completed`; responseId?: string; itemId?: string } + | { type: `output_transcript.delta`; delta: string; responseId?: string } + | { + type: `output_transcript.completed` + text?: string + responseId?: string + } + | { type: `response.started`; responseId?: string } + | { type: `response.completed`; responseId?: string } + | { type: `response.cancelled`; responseId?: string } + | { + type: `tool_call.started` + toolCallId: string + name: string + args?: unknown + } + | { + type: `tool_call.arguments_delta` + toolCallId: string + delta: string + } + | { + type: `tool_call.arguments_completed` + toolCallId: string + name: string + args: unknown + } + | { + type: `tool_call.completed` + toolCallId: string + name: string + result: unknown + isError?: boolean + } + +export interface RealtimeProviderConnectInput { + systemPrompt: string + messages: Array + tools: Array + audio?: RealtimeAudioConfig + session?: ManifestRealtimeSessionEntry + signal?: AbortSignal +} + +export interface RealtimeToolResult { + toolCallId: string + name: string + result: unknown + isError?: boolean +} + +export interface RealtimeProviderSession { + events: AsyncIterable + updateSession?: (update: unknown) => Promise + appendInputAudio?: ( + chunk: Uint8Array, + meta?: Record + ) => Promise + commitInputAudio?: () => Promise + sendText?: (text: string) => Promise + sendToolResult?: (result: RealtimeToolResult) => Promise + cancelResponse?: () => Promise + truncateOutputAudio?: (opts: { + itemId: string + audioEndMs: number + }) => Promise + close?: (reason?: string) => Promise +} + +export interface RealtimeProviderConfig { + id: string + model: string + connect: ( + input: RealtimeProviderConnectInput + ) => Promise +} + +export interface RealtimeConfig { + systemPrompt: string + provider: RealtimeProviderConfig + tools?: Array + audio?: RealtimeAudioConfig + toolPolicy?: RealtimeToolPolicy + context?: RealtimeContextConfig + session?: RealtimeSessionPolicy + testResponses?: TestResponses +} + +export type RealtimeRunResult = AgentRunResult + +export interface RealtimeHandle { + run: () => Promise + close: (reason?: string) => Promise + stop: (reason?: string) => Promise + cancelResponse: (opts?: { truncateAudio?: boolean }) => Promise + sendText: (text: string) => Promise +} + +export interface RealtimeHelpers { + activeSession: () => ManifestRealtimeSessionEntry | undefined + sessions: () => Array +} + export type TestResponses = Array | TestResponseFn export type TestResponseFn = ( @@ -1030,6 +1176,7 @@ export interface HandlerContext< */ sandbox: Sandbox useAgent: (config: AgentConfig) => AgentHandle + useRealtime: (config: RealtimeConfig) => RealtimeHandle useContext: (config: UseContextConfig) => void timelineMessages: (opts?: TimelineProjectionOpts) => Array insertContext: (id: string, entry: ContextEntryInput) => void @@ -1037,6 +1184,7 @@ export interface HandlerContext< getContext: (id: string) => ContextEntry | undefined listContext: () => Array agent: AgentHandle + realtime: RealtimeHelpers spawn: ( type: string, id: string, diff --git a/packages/agents-runtime/test/realtime-context.test.ts b/packages/agents-runtime/test/realtime-context.test.ts new file mode 100644 index 0000000000..9e5ebfac2d --- /dev/null +++ b/packages/agents-runtime/test/realtime-context.test.ts @@ -0,0 +1,65 @@ +import { describe, expect, it } from 'vitest' +import { createTestRealtimeProvider } from '../src/realtime' +import { createTestHandlerContext } from './helpers/context-test-helpers' + +describe(`ctx.useRealtime()`, () => { + it(`records provider transcript output through the outbound bridge`, async () => { + const { ctx } = createTestHandlerContext() + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ response: `hello from voice` }), + tools: [], + }) + + await realtime.run() + + expect(ctx.db.collections.runs.toArray).toMatchObject([ + { key: `run-0`, status: `completed`, finish_reason: `stop` }, + ]) + expect(ctx.db.collections.steps.toArray).toMatchObject([ + { + key: `step-0`, + run_id: `run-0`, + model_provider: `test`, + model_id: `test-realtime`, + status: `completed`, + finish_reason: `stop`, + }, + ]) + expect(ctx.db.collections.textDeltas.toArray).toMatchObject([ + { + text_id: `msg-0`, + run_id: `run-0`, + delta: `hello from voice`, + }, + ]) + }) + + it(`finds active realtime sessions from the manifest`, () => { + const { ctx } = createTestHandlerContext() + + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `active`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/entities/test/realtime/rt-1/audio/in`, + audio_out: `/entities/test/realtime/rt-1/audio/out`, + control_in: `/entities/test/realtime/rt-1/control/in`, + control_out: `/entities/test/realtime/rt-1/control/out`, + }, + }) + + expect(ctx.realtime.activeSession()).toMatchObject({ + id: `rt-1`, + status: `active`, + }) + }) +}) From 213d1973f78fff8937a0fe440751790697cea928 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 11:45:06 +0100 Subject: [PATCH 03/31] feat(agents-server): add realtime session route --- packages/agents-server/src/entity-manager.ts | 178 ++++++++++++++++++ packages/agents-server/src/index.ts | 4 + .../src/routing/internal-router.ts | 2 + .../src/routing/realtime-router.ts | 89 +++++++++ ...ic-agents-manager-write-validation.test.ts | 96 ++++++++++ 5 files changed, 369 insertions(+) create mode 100644 packages/agents-server/src/routing/realtime-router.ts diff --git a/packages/agents-server/src/entity-manager.ts b/packages/agents-server/src/entity-manager.ts index 26700b246f..f3a09b82a9 100644 --- a/packages/agents-server/src/entity-manager.ts +++ b/packages/agents-server/src/entity-manager.ts @@ -114,6 +114,35 @@ type ServerSignalEvent = { txid: string } } +type RealtimeAudioRequest = { + codec?: `pcm16` + sampleRate?: number + channels?: number +} + +export type RealtimeSessionCreateRequest = { + id?: string + provider: string + model: string + inputAudio?: RealtimeAudioRequest + outputAudio?: RealtimeAudioRequest + meta?: Record +} + +export type RealtimeSessionCreateResult = { + sessionId: string + entityUrl: string + provider: string + model: string + status: `requested` + startedAt: string + streams: { + audio_in: string + audio_out: string + control_in: string + control_out: string + } +} type AttachmentSubjectType = `inbox` | `run` | `text` | `tool_call` | `context` type AttachmentRole = `input` | `output` @@ -254,6 +283,19 @@ function getEntityAttachmentStreamPath( return `${entityUrl.replace(/\/+$/, ``)}/attachments/${attachmentId}` } +function getRealtimeSessionBasePath( + entityUrl: string, + sessionId: string +): string { + return `${entityUrl.replace(/\/+$/, ``)}/realtime/${sessionId}` +} + +function realtimeAudioContentType(audio?: RealtimeAudioRequest): string { + const sampleRate = audio?.sampleRate ?? 24_000 + const channels = audio?.channels ?? 1 + return `audio/pcm; rate=${sampleRate}; channels=${channels}` +} + function isStreamCreateConflict(error: unknown): boolean { return ( !!error && @@ -286,6 +328,16 @@ function validateAttachmentId(id: string): void { } } +function validateRealtimeSessionId(id: string): void { + if (!id || id.includes(`/`) || id.startsWith(`.`)) { + throw new ElectricAgentsError( + ErrCodeInvalidRequest, + `realtime session id must not be empty, start with ".", or contain forward slashes`, + 400 + ) + } +} + function validateAttachmentSubject( subject: CreateAttachmentRequest[`subject`] ): void { @@ -2493,6 +2545,132 @@ export class EntityManager { ) } + async createRealtimeSession( + entityUrl: string, + req: RealtimeSessionCreateRequest + ): Promise { + const entity = await this.registry.getEntity(entityUrl) + if (!entity) { + throw new ElectricAgentsError(ErrCodeNotFound, `Entity not found`, 404) + } + if (rejectsNormalWrites(entity.status)) { + throw new ElectricAgentsError( + ErrCodeNotRunning, + `Entity is not accepting writes`, + 409 + ) + } + if (this.isForkWorkLockedEntity(entityUrl)) { + this.assertEntityNotForkWorkLocked(entityUrl) + } + + const provider = req.provider.trim() + const model = req.model.trim() + if (!provider || !model) { + throw new ElectricAgentsError( + ErrCodeInvalidRequest, + `provider and model are required`, + 400 + ) + } + + const sessionId = req.id ?? `rt-${randomUUID()}` + validateRealtimeSessionId(sessionId) + + const basePath = getRealtimeSessionBasePath(entityUrl, sessionId) + const streams = { + audio_in: `${basePath}/audio/in`, + audio_out: `${basePath}/audio/out`, + control_in: `${basePath}/control/in`, + control_out: `${basePath}/control/out`, + } + const startedAt = new Date().toISOString() + const manifestKey = `realtime-session:${sessionId}` + const txid = randomUUID() + const createdStreams: Array = [] + + try { + for (const [path, contentType] of [ + [streams.audio_in, realtimeAudioContentType(req.inputAudio)], + [streams.audio_out, realtimeAudioContentType(req.outputAudio)], + [streams.control_in, `application/json`], + [streams.control_out, `application/json`], + ] as const) { + await this.streamClient.create(path, { contentType }) + createdStreams.push(path) + } + + await this.writeManifestEntry( + entityUrl, + manifestKey, + `upsert`, + { + kind: `realtime-session`, + id: sessionId, + provider, + model, + status: `requested`, + startedAt, + endedAt: null, + streams, + retention: `forever`, + ...(req.meta ? { meta: req.meta } : {}), + }, + { txid } + ) + + const sessionEvent = entityStateSchema.realtimeSessions.insert({ + key: manifestKey, + value: { + session_id: sessionId, + provider, + model, + status: `requested`, + started_at: startedAt, + streams, + ...(req.meta ? { meta: req.meta } : {}), + }, + } as never) + await this.streamClient.append( + entity.streams.main, + this.encodeChangeEvent(sessionEvent as Record) + ) + + await this.send(entityUrl, { + from: SERVER_SIGNAL_SENDER, + payload: { + type: `realtime_session.started`, + sessionId, + provider, + model, + streams, + }, + }) + } catch (error) { + await Promise.allSettled( + createdStreams.map((path) => this.streamClient.delete(path)) + ) + if (isStreamCreateConflict(error)) { + throw new ElectricAgentsError( + ErrCodeInvalidRequest, + `Realtime session already exists at id "${sessionId}"`, + 409 + ) + } + throw error + } + + return { + sessionId, + entityUrl, + provider, + model, + status: `requested`, + startedAt, + streams, + } + } + // ========================================================================== // Attachments // ========================================================================== diff --git a/packages/agents-server/src/index.ts b/packages/agents-server/src/index.ts index d4392a10d8..ef6c43527d 100644 --- a/packages/agents-server/src/index.ts +++ b/packages/agents-server/src/index.ts @@ -1,6 +1,10 @@ export { createDb, runMigrations } from './db/index.js' export type { DrizzleDB, PgClient } from './db/index.js' export { AgentsHost } from './host.js' +export type { + RealtimeSessionCreateRequest, + RealtimeSessionCreateResult, +} from './entity-manager.js' export type { AgentsHostOptions, AgentsHostTenantConfig, diff --git a/packages/agents-server/src/routing/internal-router.ts b/packages/agents-server/src/routing/internal-router.ts index 43970704d9..2434001134 100644 --- a/packages/agents-server/src/routing/internal-router.ts +++ b/packages/agents-server/src/routing/internal-router.ts @@ -32,6 +32,7 @@ import { entitiesRouter } from './entities-router.js' import { entityTypesRouter } from './entity-types-router.js' import { getRequestSpan } from './hooks.js' import { observationsRouter } from './observations-router.js' +import { realtimeRouter } from './realtime-router.js' import { runnersRouter } from './runners-router.js' import { routeBody, validateOptionalJsonBody, withSchema } from './schema.js' import { withLeadingSlash } from './tenant-stream-paths.js' @@ -136,6 +137,7 @@ internalRouter.all(`/runners/*`, runnersRouter.fetch) internalRouter.all(`/entities/*`, entitiesRouter.fetch) internalRouter.all(`/entity-types/*`, entityTypesRouter.fetch) internalRouter.all(`/observations/*`, observationsRouter.fetch) +internalRouter.all(`/realtime/*`, realtimeRouter.fetch) internalRouter.get(`/electric/*`, electricProxyRouter.fetch) internalRouter.all(`*`, () => status(404)) diff --git a/packages/agents-server/src/routing/realtime-router.ts b/packages/agents-server/src/routing/realtime-router.ts new file mode 100644 index 0000000000..7d65b99d8c --- /dev/null +++ b/packages/agents-server/src/routing/realtime-router.ts @@ -0,0 +1,89 @@ +/** + * HTTP routes for realtime session management. + */ + +import { Type, type Static } from '@sinclair/typebox' +import { Router, json } from 'itty-router' +import { apiError } from '../electric-agents-http.js' +import { + ErrCodeNotFound, + ErrCodeUnauthorized, +} from '../electric-agents-types.js' +import { canAccessEntity } from '../permissions.js' +import { routeBody, withSchema } from './schema.js' +import type { JsonRouteRequest } from './schema.js' +import type { RouterType } from 'itty-router' +import type { TenantContext } from './context.js' + +interface RealtimeRouteRequest extends JsonRouteRequest {} + +type RealtimeRouteArgs = [TenantContext] +type RealtimeRouteResult = Response | undefined + +export type RealtimeRoutes = RouterType< + RealtimeRouteRequest, + RealtimeRouteArgs, + RealtimeRouteResult +> + +const realtimeAudioRequestSchema = Type.Object( + { + codec: Type.Optional(Type.Literal(`pcm16`)), + sampleRate: Type.Optional(Type.Number()), + channels: Type.Optional(Type.Number()), + }, + { additionalProperties: false } +) + +const realtimeSessionCreateBodySchema = Type.Object( + { + entityUrl: Type.String(), + id: Type.Optional(Type.String()), + provider: Type.String(), + model: Type.String(), + inputAudio: Type.Optional(realtimeAudioRequestSchema), + outputAudio: Type.Optional(realtimeAudioRequestSchema), + meta: Type.Optional(Type.Record(Type.String(), Type.Unknown())), + }, + { additionalProperties: false } +) + +type RealtimeSessionCreateBody = Static + +export const realtimeRouter: RealtimeRoutes = Router< + RealtimeRouteRequest, + RealtimeRouteArgs, + RealtimeRouteResult +>({ + base: `/_electric/realtime`, +}) + +realtimeRouter.post( + `/sessions`, + withSchema(realtimeSessionCreateBodySchema), + createRealtimeSession +) + +async function createRealtimeSession( + request: RealtimeRouteRequest, + ctx: TenantContext +): Promise { + const parsed = routeBody(request) + const entity = await ctx.entityManager.registry.getEntity(parsed.entityUrl) + if (!entity) { + return apiError(404, ErrCodeNotFound, `Entity not found`) + } + if (!(await canAccessEntity(ctx, entity, `write`, request as Request))) { + return apiError( + 401, + ErrCodeUnauthorized, + `Principal is not allowed to write ${entity.url}` + ) + } + + const result = await ctx.entityManager.createRealtimeSession( + parsed.entityUrl, + parsed + ) + return json(result, { status: 201 }) +} diff --git a/packages/agents-server/test/electric-agents-manager-write-validation.test.ts b/packages/agents-server/test/electric-agents-manager-write-validation.test.ts index 4cab89f72b..ffa8ec7fc9 100644 --- a/packages/agents-server/test/electric-agents-manager-write-validation.test.ts +++ b/packages/agents-server/test/electric-agents-manager-write-validation.test.ts @@ -94,6 +94,13 @@ function attachmentManifest(value: Record) { } } +function decodeAppend(call: unknown[]): Record { + const body = call[1] + const bytes = + body instanceof Uint8Array ? body : new TextEncoder().encode(String(body)) + return JSON.parse(new TextDecoder().decode(bytes)) as Record +} + describe(`ElectricAgentsManager.validateWriteEvent`, () => { it(`validates delete events against old_value instead of value`, async () => { const manager = createManager() @@ -140,6 +147,95 @@ describe(`ElectricAgentsManager.validateWriteEvent`, () => { }) }) +describe(`ElectricAgentsManager realtime sessions`, () => { + it(`creates durable IO streams and records a replayable session manifest`, async () => { + const create = vi.fn() + const append = vi.fn() + const { manager } = createAttachmentManager({ + streamClient: { create, append }, + }) + + const result = await manager.createRealtimeSession(`/chat/session-1`, { + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + inputAudio: { codec: `pcm16`, sampleRate: 16_000, channels: 1 }, + outputAudio: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + meta: { source: `test` }, + }) + + expect(result.streams).toEqual({ + audio_in: `/chat/session-1/realtime/rt-1/audio/in`, + audio_out: `/chat/session-1/realtime/rt-1/audio/out`, + control_in: `/chat/session-1/realtime/rt-1/control/in`, + control_out: `/chat/session-1/realtime/rt-1/control/out`, + }) + expect(create).toHaveBeenCalledTimes(4) + expect(create.mock.calls).toEqual([ + [ + `/chat/session-1/realtime/rt-1/audio/in`, + { contentType: `audio/pcm; rate=16000; channels=1` }, + ], + [ + `/chat/session-1/realtime/rt-1/audio/out`, + { contentType: `audio/pcm; rate=24000; channels=1` }, + ], + [ + `/chat/session-1/realtime/rt-1/control/in`, + { contentType: `application/json` }, + ], + [ + `/chat/session-1/realtime/rt-1/control/out`, + { contentType: `application/json` }, + ], + ]) + + expect(append).toHaveBeenCalledTimes(3) + const manifestEvent = decodeAppend(append.mock.calls[0]!) + const sessionEvent = decodeAppend(append.mock.calls[1]!) + const inboxEvent = decodeAppend(append.mock.calls[2]!) + + expect(manifestEvent).toMatchObject({ + type: `manifest`, + key: `realtime-session:rt-1`, + headers: { operation: `upsert` }, + value: { + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `requested`, + streams: result.streams, + retention: `forever`, + meta: { source: `test` }, + }, + }) + expect(sessionEvent).toMatchObject({ + type: `realtime_session`, + key: `realtime-session:rt-1`, + value: { + session_id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `requested`, + streams: result.streams, + }, + }) + expect(inboxEvent).toMatchObject({ + type: `inbox`, + value: { + from: `/_electric/server`, + payload: { + type: `realtime_session.started`, + sessionId: `rt-1`, + streams: result.streams, + }, + }, + }) + expect(inboxEvent.value).not.toHaveProperty(`message_type`) + }) +}) + describe(`ElectricAgentsManager attachments`, () => { it(`does not delete an existing stream when duplicate attachment creation conflicts`, async () => { const create = vi.fn().mockRejectedValue({ status: 409 }) From da2308629d9620ceaed0141f150ec5d1a047ff76 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 11:47:43 +0100 Subject: [PATCH 04/31] feat(agents-runtime): add realtime session client --- packages/agents-runtime/src/agents-client.ts | 9 +++ packages/agents-runtime/src/client.ts | 5 ++ packages/agents-runtime/src/index.ts | 3 + .../src/runtime-server-client.ts | 49 ++++++++++++ .../test/electric-agents-client.test.ts | 41 ++++++++++ ...time-server-client-update-metadata.test.ts | 78 +++++++++++++++++++ 6 files changed, 185 insertions(+) diff --git a/packages/agents-runtime/src/agents-client.ts b/packages/agents-runtime/src/agents-client.ts index dd3358abca..6e86b6f9a5 100644 --- a/packages/agents-runtime/src/agents-client.ts +++ b/packages/agents-runtime/src/agents-client.ts @@ -4,6 +4,10 @@ import { normalizeObservationSchema } from './observation-schema' import { createRuntimeServerClient } from './runtime-server-client' import { appendPathToUrl } from './url' import type { EntitySignal } from './runtime-server-client' +import type { + RealtimeSessionStartResult, + StartRealtimeSessionOptions, +} from './runtime-server-client' import type { EntitiesObservationSource, EntityObservationSource, @@ -31,6 +35,9 @@ export interface AgentsClient { payload?: unknown }) => Promise<{ txid: number }> kill: (entityUrl: string, reason?: string) => Promise<{ txid: number }> + startRealtimeSession: ( + options: StartRealtimeSessionOptions + ) => Promise } export function createAgentsClient(config: AgentsClientConfig): AgentsClient { @@ -44,6 +51,8 @@ export function createAgentsClient(config: AgentsClientConfig): AgentsClient { signal: `SIGKILL`, reason, }), + startRealtimeSession: (options) => + serverClient.startRealtimeSession(options), async observe(source) { if (source.sourceType === `entity`) { const info = await serverClient.getEntity( diff --git a/packages/agents-runtime/src/client.ts b/packages/agents-runtime/src/client.ts index c222461724..3aa0d6e8d4 100644 --- a/packages/agents-runtime/src/client.ts +++ b/packages/agents-runtime/src/client.ts @@ -48,6 +48,11 @@ export type { SlashCommandTrigger, } from './composer-input' export type { AgentsClient, AgentsClientConfig } from './agents-client' +export type { + RealtimeAudioOptions, + RealtimeSessionStartResult, + StartRealtimeSessionOptions, +} from './runtime-server-client' export type { AttachmentRole, AttachmentStatus, diff --git a/packages/agents-runtime/src/index.ts b/packages/agents-runtime/src/index.ts index fe065de595..ab8233d724 100644 --- a/packages/agents-runtime/src/index.ts +++ b/packages/agents-runtime/src/index.ts @@ -278,6 +278,9 @@ export type { DispatchPolicy, SpawnEntityOptions, SendEntityMessageOptions, + RealtimeAudioOptions, + RealtimeSessionStartResult, + StartRealtimeSessionOptions, } from './runtime-server-client' export { buildEventSourceManifestEntry, diff --git a/packages/agents-runtime/src/runtime-server-client.ts b/packages/agents-runtime/src/runtime-server-client.ts index 68134257d5..9c8c406e03 100644 --- a/packages/agents-runtime/src/runtime-server-client.ts +++ b/packages/agents-runtime/src/runtime-server-client.ts @@ -8,6 +8,7 @@ import type { ManifestAttachmentEntry, } from './types' import type { EntitySignal } from './entity-schema' +import type { RealtimeSessionStreamRefs } from './entity-schema' import type { EventSourceContract, EventSourceSubscription, @@ -91,6 +92,32 @@ export interface SendEntityMessageOptions { writeToken?: string } +export interface RealtimeAudioOptions { + codec?: `pcm16` + sampleRate?: number + channels?: number +} + +export interface StartRealtimeSessionOptions { + entityUrl: string + id?: string + provider: string + model: string + inputAudio?: RealtimeAudioOptions + outputAudio?: RealtimeAudioOptions + meta?: Record +} + +export interface RealtimeSessionStartResult { + sessionId: string + entityUrl: string + provider: string + model: string + status: `requested` + startedAt: string + streams: RealtimeSessionStreamRefs +} + export interface RegisterWakeOptions { subscriberUrl: string sourceUrl: string @@ -116,6 +143,9 @@ export interface SignalEntityOptions { export interface RuntimeServerClient { sendEntityMessage: (options: SendEntityMessageOptions) => Promise + startRealtimeSession: ( + options: StartRealtimeSessionOptions + ) => Promise createAttachment: (options: { entityUrl: string attachment: AttachmentCreateInput @@ -358,6 +388,24 @@ export function createRuntimeServerClient( } } + const startRealtimeSession = async ( + options: StartRealtimeSessionOptions + ): Promise => { + const response = await request(`/_electric/realtime/sessions`, { + method: `POST`, + headers: { 'content-type': `application/json` }, + body: JSON.stringify(options), + }) + + if (!response.ok) { + throw new Error( + `startRealtimeSession ${options.entityUrl} failed (${response.status}): ${await readErrorText(response)}` + ) + } + + return (await response.json()) as RealtimeSessionStartResult + } + const createAttachment = async ({ entityUrl, attachment, @@ -875,6 +923,7 @@ export function createRuntimeServerClient( return { sendEntityMessage, + startRealtimeSession, createAttachment, readAttachment, spawnEntity, diff --git a/packages/agents-runtime/test/electric-agents-client.test.ts b/packages/agents-runtime/test/electric-agents-client.test.ts index 7e60b9c9c3..309d7512df 100644 --- a/packages/agents-runtime/test/electric-agents-client.test.ts +++ b/packages/agents-runtime/test/electric-agents-client.test.ts @@ -8,6 +8,7 @@ const { mockState } = vi.hoisted(() => ({ ensureEntitiesMembershipStream: vi.fn(), ensureCronStream: vi.fn(), signalEntity: vi.fn(), + startRealtimeSession: vi.fn(), ensureStream: vi.fn(), createStreamDB: vi.fn(), preload: vi.fn(), @@ -25,6 +26,7 @@ vi.mock(`../src/runtime-server-client`, () => ({ ensureEntitiesMembershipStream: mockState.ensureEntitiesMembershipStream, ensureCronStream: mockState.ensureCronStream, signalEntity: mockState.signalEntity, + startRealtimeSession: mockState.startRealtimeSession, ensureStream: mockState.ensureStream, }), })) @@ -49,6 +51,20 @@ describe(`createAgentsClient`, () => { mockState.ensureStream = vi.fn().mockResolvedValue(`/_webhooks/repo`) mockState.createStreamDB = vi.fn() mockState.signalEntity = vi.fn().mockResolvedValue({ txid: 123 }) + mockState.startRealtimeSession = vi.fn().mockResolvedValue({ + sessionId: `rt-1`, + entityUrl: `/horton/demo`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `requested`, + startedAt: `2026-06-09T10:00:00.000Z`, + streams: { + audio_in: `/horton/demo/realtime/rt-1/audio/in`, + audio_out: `/horton/demo/realtime/rt-1/audio/out`, + control_in: `/horton/demo/realtime/rt-1/control/in`, + control_out: `/horton/demo/realtime/rt-1/control/out`, + }, + }) mockState.observedDb = { preload: vi.fn().mockResolvedValue(undefined), collections: { @@ -151,6 +167,31 @@ describe(`createAgentsClient`, () => { }) }) + it(`exposes realtime session start through the server client`, async () => { + const client = createAgentsClient({ + baseUrl: `http://electric-agents.test`, + }) + + await expect( + client.startRealtimeSession({ + entityUrl: `/horton/demo`, + provider: `openai`, + model: `gpt-realtime-2`, + }) + ).resolves.toMatchObject({ + sessionId: `rt-1`, + streams: { + audio_in: `/horton/demo/realtime/rt-1/audio/in`, + }, + }) + + expect(mockState.startRealtimeSession).toHaveBeenCalledWith({ + entityUrl: `/horton/demo`, + provider: `openai`, + model: `gpt-realtime-2`, + }) + }) + it(`observe(webhook(...)) ensures the exact stream before preloading it`, async () => { const client = createAgentsClient({ baseUrl: `http://electric-agents.test/t/tenant-a/v1`, diff --git a/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts b/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts index 89e3839644..c88a9ebad4 100644 --- a/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts +++ b/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts @@ -136,6 +136,84 @@ describe(`runtime-server-client.setTag`, () => { }) }) +describe(`runtime-server-client realtime sessions`, () => { + it(`starts a realtime session through the control-plane route`, async () => { + const calls: Array<{ url: string; init?: RequestInit }> = [] + const responseBody = { + sessionId: `rt-1`, + entityUrl: `/horton/demo`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `requested`, + startedAt: `2026-06-09T10:00:00.000Z`, + streams: { + audio_in: `/horton/demo/realtime/rt-1/audio/in`, + audio_out: `/horton/demo/realtime/rt-1/audio/out`, + control_in: `/horton/demo/realtime/rt-1/control/in`, + control_out: `/horton/demo/realtime/rt-1/control/out`, + }, + } + const fakeFetch = vi.fn(async (url: string, init?: RequestInit) => { + calls.push({ url, init }) + return new Response(JSON.stringify(responseBody), { + status: 201, + headers: { 'content-type': `application/json` }, + }) + }) as unknown as typeof fetch + const client = createRuntimeServerClient({ + baseUrl: `http://test.example/t/tenant-a/v1`, + fetch: fakeFetch, + principalKey: `user:sam`, + }) + + await expect( + client.startRealtimeSession({ + entityUrl: `/horton/demo`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + inputAudio: { codec: `pcm16`, sampleRate: 16_000, channels: 1 }, + meta: { source: `button` }, + }) + ).resolves.toEqual(responseBody) + + expect(calls).toHaveLength(1) + expect(calls[0]!.url).toBe( + `http://test.example/t/tenant-a/v1/_electric/realtime/sessions` + ) + expect(calls[0]!.init?.method).toBe(`POST`) + const headers = new Headers(calls[0]!.init?.headers) + expect(headers.get(`content-type`)).toBe(`application/json`) + expect(headers.get(`electric-principal`)).toBe(`user:sam`) + expect(JSON.parse(calls[0]!.init!.body as string)).toEqual({ + entityUrl: `/horton/demo`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + inputAudio: { codec: `pcm16`, sampleRate: 16_000, channels: 1 }, + meta: { source: `button` }, + }) + }) + + it(`surfaces realtime session start failures`, async () => { + const fakeFetch = vi.fn( + async () => new Response(`not allowed`, { status: 401 }) + ) as unknown as typeof fetch + const client = createRuntimeServerClient({ + baseUrl: `http://test.example`, + fetch: fakeFetch, + }) + + await expect( + client.startRealtimeSession({ + entityUrl: `/horton/demo`, + provider: `openai`, + model: `gpt-realtime-2`, + }) + ).rejects.toThrow(/startRealtimeSession.*401.*not allowed/) + }) +}) + describe(`runtime-server-client event sources`, () => { it(`lists event sources from the runtime server`, async () => { const fakeFetch = vi.fn( From 19596dbb8a156179d9258503c9085e22e3ca8f9e Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 11:58:07 +0100 Subject: [PATCH 05/31] feat(agents-runtime): add openai realtime provider --- packages/agents-runtime/src/index.ts | 2 + .../agents-runtime/src/openai-realtime.ts | 615 ++++++++++++++++++ .../test/openai-realtime.test.ts | 217 ++++++ 3 files changed, 834 insertions(+) create mode 100644 packages/agents-runtime/src/openai-realtime.ts create mode 100644 packages/agents-runtime/test/openai-realtime.test.ts diff --git a/packages/agents-runtime/src/index.ts b/packages/agents-runtime/src/index.ts index ab8233d724..33d5ce3e21 100644 --- a/packages/agents-runtime/src/index.ts +++ b/packages/agents-runtime/src/index.ts @@ -146,6 +146,8 @@ export type { export { createEntityStreamDB } from './entity-stream-db' export { createTestRealtimeProvider } from './realtime' export type { TestRealtimeProviderOptions } from './realtime' +export { createOpenAIRealtimeProvider } from './openai-realtime' +export type { OpenAIRealtimeProviderOptions } from './openai-realtime' export { getEntityAttachmentStreamPath, manifestAttachmentKey, diff --git a/packages/agents-runtime/src/openai-realtime.ts b/packages/agents-runtime/src/openai-realtime.ts new file mode 100644 index 0000000000..649fe1a25c --- /dev/null +++ b/packages/agents-runtime/src/openai-realtime.ts @@ -0,0 +1,615 @@ +import type { + AgentTool, + LLMMessage, + RealtimeAudioFormat, + RealtimeProviderConfig, + RealtimeProviderConnectInput, + RealtimeProviderEvent, + RealtimeProviderSession, + RealtimeToolResult, +} from './types' + +type MaybePromise = T | Promise +type OpenAIRealtimeSocket = { + send: (data: string) => void + close?: (code?: number, reason?: string) => void + addEventListener?: ( + event: string, + handler: (...args: Array) => void + ) => void + removeEventListener?: ( + event: string, + handler: (...args: Array) => void + ) => void + on?: (event: string, handler: (...args: Array) => void) => void + off?: (event: string, handler: (...args: Array) => void) => void + readyState?: number +} +type OpenAIRealtimeWebSocketConstructor = new ( + url: string, + init?: unknown +) => OpenAIRealtimeSocket + +export interface OpenAIRealtimeProviderOptions { + apiKey: string | (() => MaybePromise) + model?: string + url?: string + voice?: string + safetyIdentifier?: string + headers?: Record + WebSocket?: OpenAIRealtimeWebSocketConstructor +} + +type OpenAIRealtimeEvent = Record & { type?: string } + +class AsyncEventQueue implements AsyncIterable { + private values: Array = [] + private resolvers: Array<(value: IteratorResult) => void> = [] + private closed = false + private error: unknown + + push(value: T): void { + if (this.closed) return + const resolve = this.resolvers.shift() + if (resolve) { + resolve({ value, done: false }) + return + } + this.values.push(value) + } + + close(): void { + if (this.closed) return + this.closed = true + for (const resolve of this.resolvers.splice(0)) { + resolve({ value: undefined as T, done: true }) + } + } + + fail(error: unknown): void { + this.error = error + this.close() + } + + [Symbol.asyncIterator](): AsyncIterator { + return { + next: () => { + if (this.values.length > 0) { + return Promise.resolve({ value: this.values.shift()!, done: false }) + } + if (this.error) { + return Promise.reject(this.error) + } + if (this.closed) { + return Promise.resolve({ value: undefined as T, done: true }) + } + return new Promise>((resolve) => { + this.resolvers.push(resolve) + }) + }, + } + } +} + +function resolveWebSocket( + opts: OpenAIRealtimeProviderOptions +): OpenAIRealtimeWebSocketConstructor { + const ctor = opts.WebSocket ?? globalThis.WebSocket + if (!ctor) { + throw new Error( + `[agent-runtime] OpenAI realtime requires a WebSocket implementation` + ) + } + return ctor as unknown as OpenAIRealtimeWebSocketConstructor +} + +function onSocket( + ws: OpenAIRealtimeSocket, + event: string, + handler: (...args: Array) => void +): void { + if (ws.addEventListener) { + ws.addEventListener(event, handler) + return + } + ws.on?.(event, handler) +} + +function socketMessageData(args: Array): unknown { + const [first] = args + if (first && typeof first === `object` && `data` in first) { + return (first as { data: unknown }).data + } + return first +} + +function dataToString(data: unknown): string { + if (typeof data === `string`) return data + if (data instanceof ArrayBuffer) return new TextDecoder().decode(data) + if (data instanceof Uint8Array) return new TextDecoder().decode(data) + if ( + data && + typeof data === `object` && + `toString` in data && + typeof data.toString === `function` + ) { + return data.toString() + } + return String(data) +} + +function bytesToBase64(bytes: Uint8Array): string { + const bufferCtor = (globalThis as { Buffer?: typeof Buffer }).Buffer + if (bufferCtor) return bufferCtor.from(bytes).toString(`base64`) + let binary = `` + for (const byte of bytes) binary += String.fromCharCode(byte) + return btoa(binary) +} + +function base64ToBytes(value: string): Uint8Array { + const bufferCtor = (globalThis as { Buffer?: typeof Buffer }).Buffer + if (bufferCtor) return new Uint8Array(bufferCtor.from(value, `base64`)) + const binary = atob(value) + const bytes = new Uint8Array(binary.length) + for (let index = 0; index < binary.length; index += 1) { + bytes[index] = binary.charCodeAt(index) + } + return bytes +} + +function sendJson(ws: OpenAIRealtimeSocket, event: unknown): void { + ws.send(JSON.stringify(event)) +} + +function toolName(tool: AgentTool): string { + return tool.name +} + +function toOpenAITool(tool: AgentTool): Record { + return { + type: `function`, + name: tool.name, + description: tool.description, + parameters: tool.parameters, + } +} + +function messageContentText(content: unknown): string { + if (typeof content === `string`) return content + if (!Array.isArray(content)) return `` + return content + .map((part) => { + if (!part || typeof part !== `object`) return `` + const text = (part as { text?: unknown }).text + return typeof text === `string` ? text : `` + }) + .filter(Boolean) + .join(`\n`) +} + +function messageRole(message: LLMMessage): `user` | `assistant` | null { + const role = (message as { role?: unknown }).role + return role === `assistant` ? `assistant` : role === `user` ? `user` : null +} + +function sendConversationMessage( + ws: OpenAIRealtimeSocket, + message: LLMMessage +): void { + const role = messageRole(message) + if (!role) return + const text = messageContentText((message as { content?: unknown }).content) + if (!text) return + sendJson(ws, { + type: `conversation.item.create`, + item: { + type: `message`, + role, + content: [ + { + type: role === `assistant` ? `output_text` : `input_text`, + text, + }, + ], + }, + }) +} + +function realtimeFormat( + format: RealtimeAudioFormat | undefined +): Record | undefined { + if (!format) return undefined + return { + type: `audio/pcm`, + rate: format.sampleRate, + } +} + +function buildSessionUpdate( + opts: OpenAIRealtimeProviderOptions, + input: RealtimeProviderConnectInput +): Record { + const inputFormat = realtimeFormat(input.audio?.inputFormat) + const outputFormat = realtimeFormat(input.audio?.outputFormat) + return { + type: `session.update`, + session: { + type: `realtime`, + model: opts.model ?? `gpt-realtime-2`, + instructions: input.systemPrompt, + output_modalities: outputFormat ? [`audio`] : [`text`], + tool_choice: input.tools.length > 0 ? `auto` : `none`, + ...(input.tools.length > 0 + ? { tools: input.tools.map((tool) => toOpenAITool(tool)) } + : {}), + ...(inputFormat || outputFormat || opts.voice + ? { + audio: { + ...(inputFormat ? { input: { format: inputFormat } } : {}), + ...(outputFormat || opts.voice + ? { + output: { + ...(outputFormat ? { format: outputFormat } : {}), + ...(opts.voice ? { voice: opts.voice } : {}), + }, + } + : {}), + }, + } + : {}), + }, + } +} + +function parseToolArgs(value: unknown): unknown { + if (typeof value !== `string`) return value ?? {} + try { + return JSON.parse(value) as unknown + } catch { + return value + } +} + +function toolResultOutput(result: RealtimeToolResult): string { + if (typeof result.result === `string`) return result.result + return JSON.stringify(result.result) +} + +function mapOpenAIEvent( + event: OpenAIRealtimeEvent +): Array { + switch (event.type) { + case `session.created`: + return [{ type: `session.started`, sessionId: event.session?.id }] + case `session.updated`: + return [{ type: `session.updated` }] + case `error`: + return [ + { + type: `session.error`, + error: + typeof event.error?.message === `string` + ? event.error.message + : `OpenAI realtime error`, + code: + typeof event.error?.code === `string` + ? event.error.code + : undefined, + }, + ] + case `input_audio_buffer.speech_started`: + return [ + { + type: `input_audio.speech_started`, + audioOffset: + typeof event.audio_start_ms === `number` + ? String(event.audio_start_ms) + : undefined, + }, + ] + case `input_audio_buffer.speech_stopped`: + return [ + { + type: `input_audio.speech_stopped`, + audioOffset: + typeof event.audio_end_ms === `number` + ? String(event.audio_end_ms) + : undefined, + }, + ] + case `conversation.item.input_audio_transcription.delta`: + return [ + { + type: `input_transcript.delta`, + delta: String(event.delta ?? ``), + turnId: typeof event.item_id === `string` ? event.item_id : undefined, + }, + ] + case `conversation.item.input_audio_transcription.completed`: + return [ + { + type: `input_transcript.completed`, + text: String(event.transcript ?? ``), + turnId: typeof event.item_id === `string` ? event.item_id : undefined, + }, + ] + case `response.created`: + return [ + { + type: `response.started`, + responseId: + typeof event.response?.id === `string` + ? event.response.id + : undefined, + }, + ] + case `response.audio.delta`: + return [ + { + type: `output_audio.delta`, + audio: base64ToBytes(String(event.delta ?? ``)), + responseId: + typeof event.response_id === `string` + ? event.response_id + : undefined, + itemId: typeof event.item_id === `string` ? event.item_id : undefined, + }, + ] + case `response.audio.done`: + return [ + { + type: `output_audio.completed`, + responseId: + typeof event.response_id === `string` + ? event.response_id + : undefined, + itemId: typeof event.item_id === `string` ? event.item_id : undefined, + }, + ] + case `response.audio_transcript.delta`: + case `response.output_text.delta`: + return [ + { + type: `output_transcript.delta`, + delta: String(event.delta ?? ``), + responseId: + typeof event.response_id === `string` + ? event.response_id + : undefined, + }, + ] + case `response.audio_transcript.done`: + case `response.output_text.done`: + return [ + { + type: `output_transcript.completed`, + text: + typeof event.transcript === `string` + ? event.transcript + : typeof event.text === `string` + ? event.text + : undefined, + responseId: + typeof event.response_id === `string` + ? event.response_id + : undefined, + }, + ] + case `response.done`: + return [ + { + type: `response.completed`, + responseId: + typeof event.response?.id === `string` + ? event.response.id + : typeof event.response_id === `string` + ? event.response_id + : undefined, + }, + ] + case `response.cancelled`: + return [ + { + type: `response.cancelled`, + responseId: + typeof event.response_id === `string` + ? event.response_id + : undefined, + }, + ] + case `response.output_item.added`: + if (event.item?.type !== `function_call`) return [] + return [ + { + type: `tool_call.started`, + toolCallId: String(event.item.call_id ?? event.item.id ?? ``), + name: String(event.item.name ?? ``), + }, + ] + case `response.function_call_arguments.delta`: + return [ + { + type: `tool_call.arguments_delta`, + toolCallId: String(event.call_id ?? event.item_id ?? ``), + delta: String(event.delta ?? ``), + }, + ] + default: + return [] + } +} + +export function createOpenAIRealtimeProvider( + opts: OpenAIRealtimeProviderOptions +): RealtimeProviderConfig { + const model = opts.model ?? `gpt-realtime-2` + + return { + id: `openai`, + model, + async connect(input): Promise { + const apiKey = + typeof opts.apiKey === `function` ? await opts.apiKey() : opts.apiKey + if (!apiKey) { + throw new Error(`[agent-runtime] OpenAI realtime apiKey is required`) + } + + const WebSocketCtor = resolveWebSocket(opts) + const url = new URL(opts.url ?? `wss://api.openai.com/v1/realtime`) + url.searchParams.set(`model`, model) + const headers: Record = { + Authorization: `Bearer ${apiKey}`, + ...opts.headers, + } + if (opts.safetyIdentifier) { + headers[`OpenAI-Safety-Identifier`] = opts.safetyIdentifier + } + + const ws = new WebSocketCtor(url.toString(), { headers }) + const queue = new AsyncEventQueue() + const toolsByName = new Map( + input.tools.map((tool) => [toolName(tool), tool]) + ) + + const sendToolResult = async ( + result: RealtimeToolResult + ): Promise => { + sendJson(ws, { + type: `conversation.item.create`, + item: { + type: `function_call_output`, + call_id: result.toolCallId, + output: toolResultOutput(result), + }, + }) + sendJson(ws, { type: `response.create` }) + } + + const executeToolCall = async ( + event: OpenAIRealtimeEvent + ): Promise => { + const item = event.item ?? {} + const toolCallId = String( + event.call_id ?? item.call_id ?? item.id ?? event.item_id ?? `` + ) + const name = String(event.name ?? item.name ?? ``) + const args = parseToolArgs(event.arguments ?? item.arguments) + queue.push({ + type: `tool_call.arguments_completed`, + toolCallId, + name, + args, + }) + const tool = toolsByName.get(name) + if (!tool) { + const result: RealtimeToolResult = { + toolCallId, + name, + result: `Tool "${name}" is not available.`, + isError: true, + } + queue.push({ type: `tool_call.completed`, ...result }) + await sendToolResult(result) + return + } + + try { + const prepared = + typeof tool.prepareArguments === `function` + ? tool.prepareArguments(args) + : args + const result = await tool.execute( + toolCallId, + prepared as never, + input.signal + ) + const realtimeResult: RealtimeToolResult = { + toolCallId, + name, + result, + } + queue.push({ type: `tool_call.completed`, ...realtimeResult }) + await sendToolResult(realtimeResult) + } catch (error) { + const realtimeResult: RealtimeToolResult = { + toolCallId, + name, + result: error instanceof Error ? error.message : String(error), + isError: true, + } + queue.push({ type: `tool_call.completed`, ...realtimeResult }) + await sendToolResult(realtimeResult) + } + } + + const opened = new Promise((resolve, reject) => { + onSocket(ws, `open`, () => resolve()) + onSocket(ws, `error`, (event) => { + const error = + event instanceof Error + ? event + : new Error(`[agent-runtime] OpenAI realtime WebSocket error`) + queue.fail(error) + reject(error) + }) + }) + + onSocket(ws, `message`, (...args) => { + try { + const parsed = JSON.parse( + dataToString(socketMessageData(args)) + ) as OpenAIRealtimeEvent + if (parsed.type === `response.function_call_arguments.done`) { + void executeToolCall(parsed).catch((error) => queue.fail(error)) + return + } + for (const event of mapOpenAIEvent(parsed)) queue.push(event) + } catch (error) { + queue.fail(error) + } + }) + onSocket(ws, `close`, () => { + queue.push({ type: `session.closed` }) + queue.close() + }) + + await opened + sendJson(ws, buildSessionUpdate(opts, input)) + for (const message of input.messages) { + sendConversationMessage(ws, message) + } + + return { + events: queue, + appendInputAudio: async (chunk) => { + sendJson(ws, { + type: `input_audio_buffer.append`, + audio: bytesToBase64(chunk), + }) + }, + commitInputAudio: async () => { + sendJson(ws, { type: `input_audio_buffer.commit` }) + }, + sendText: async (text) => { + sendJson(ws, { + type: `conversation.item.create`, + item: { + type: `message`, + role: `user`, + content: [{ type: `input_text`, text }], + }, + }) + sendJson(ws, { type: `response.create` }) + }, + sendToolResult, + cancelResponse: async () => { + sendJson(ws, { type: `response.cancel` }) + }, + close: async (reason) => { + ws.close?.(1000, reason) + queue.close() + }, + } + }, + } +} diff --git a/packages/agents-runtime/test/openai-realtime.test.ts b/packages/agents-runtime/test/openai-realtime.test.ts new file mode 100644 index 0000000000..de34722ea6 --- /dev/null +++ b/packages/agents-runtime/test/openai-realtime.test.ts @@ -0,0 +1,217 @@ +import { Type } from '@sinclair/typebox' +import { describe, expect, it, vi } from 'vitest' +import { createOpenAIRealtimeProvider } from '../src/openai-realtime' +import type { AgentTool, RealtimeProviderEvent } from '../src/types' + +type Listener = (...args: Array) => void + +class FakeWebSocket { + static instances: Array = [] + + readonly sent: Array = [] + readonly listeners = new Map>() + + constructor( + readonly url: string, + readonly init?: unknown + ) { + FakeWebSocket.instances.push(this) + queueMicrotask(() => this.emit(`open`)) + } + + addEventListener(event: string, listener: Listener): void { + const listeners = this.listeners.get(event) ?? [] + listeners.push(listener) + this.listeners.set(event, listeners) + } + + send(data: string): void { + this.sent.push(JSON.parse(data) as unknown) + } + + close(): void { + this.emit(`close`) + } + + emit(event: string, payload?: unknown): void { + for (const listener of this.listeners.get(event) ?? []) { + listener(payload) + } + } + + emitMessage(payload: unknown): void { + this.emit(`message`, { data: JSON.stringify(payload) }) + } +} + +function nextEvent(iterator: AsyncIterator) { + return iterator.next().then((result) => result.value) +} + +describe(`createOpenAIRealtimeProvider`, () => { + it(`connects over WebSocket and configures session state`, async () => { + FakeWebSocket.instances = [] + const tool: AgentTool = { + name: `lookup`, + label: `Lookup`, + description: `Look up a value`, + parameters: Type.Object({ q: Type.String() }), + execute: vi.fn(), + } + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + safetyIdentifier: `user-1`, + WebSocket: FakeWebSocket, + }) + + await provider.connect({ + systemPrompt: `You are Horton.`, + messages: [{ role: `user`, content: `Previous context` } as never], + tools: [tool], + audio: { + inputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + outputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + }, + }) + + const socket = FakeWebSocket.instances[0]! + expect(socket.url).toBe( + `wss://api.openai.com/v1/realtime?model=gpt-realtime-2` + ) + expect(socket.init).toEqual({ + headers: { + Authorization: `Bearer sk-test`, + 'OpenAI-Safety-Identifier': `user-1`, + }, + }) + expect(socket.sent[0]).toMatchObject({ + type: `session.update`, + session: { + type: `realtime`, + model: `gpt-realtime-2`, + instructions: `You are Horton.`, + output_modalities: [`audio`], + tool_choice: `auto`, + tools: [ + { + type: `function`, + name: `lookup`, + description: `Look up a value`, + }, + ], + audio: { + input: { format: { type: `audio/pcm`, rate: 24_000 } }, + output: { format: { type: `audio/pcm`, rate: 24_000 } }, + }, + }, + }) + expect(socket.sent[1]).toEqual({ + type: `conversation.item.create`, + item: { + type: `message`, + role: `user`, + content: [{ type: `input_text`, text: `Previous context` }], + }, + }) + }) + + it(`sends audio input chunks as OpenAI input buffer events`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + const socket = FakeWebSocket.instances[0]! + + await session.appendInputAudio?.(new Uint8Array([1, 2, 3])) + await session.commitInputAudio?.() + + expect(socket.sent.at(-2)).toEqual({ + type: `input_audio_buffer.append`, + audio: `AQID`, + }) + expect(socket.sent.at(-1)).toEqual({ type: `input_audio_buffer.commit` }) + }) + + it(`maps OpenAI events and executes function calls`, async () => { + FakeWebSocket.instances = [] + const execute = vi.fn().mockResolvedValue({ + content: [{ type: `text`, text: `done` }], + details: { ok: true }, + }) + const tool: AgentTool = { + name: `lookup`, + label: `Lookup`, + description: `Look up a value`, + parameters: Type.Object({ q: Type.String() }), + execute, + } + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [tool], + }) + const socket = FakeWebSocket.instances[0]! + const iterator = session.events[Symbol.asyncIterator]() + + socket.emitMessage({ type: `session.created`, session: { id: `sess-1` } }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `session.started`, + sessionId: `sess-1`, + }) + + socket.emitMessage({ + type: `response.output_item.added`, + item: { + type: `function_call`, + id: `fc-1`, + call_id: `call-1`, + name: `lookup`, + }, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `tool_call.started`, + toolCallId: `call-1`, + name: `lookup`, + }) + + socket.emitMessage({ + type: `response.function_call_arguments.done`, + call_id: `call-1`, + name: `lookup`, + arguments: JSON.stringify({ q: `status` }), + }) + + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `tool_call.arguments_completed`, + toolCallId: `call-1`, + name: `lookup`, + args: { q: `status` }, + }) + await expect(nextEvent(iterator)).resolves.toMatchObject({ + type: `tool_call.completed`, + toolCallId: `call-1`, + name: `lookup`, + }) + expect(execute).toHaveBeenCalledWith(`call-1`, { q: `status` }, undefined) + expect(socket.sent.at(-2)).toMatchObject({ + type: `conversation.item.create`, + item: { + type: `function_call_output`, + call_id: `call-1`, + }, + }) + expect(socket.sent.at(-1)).toEqual({ type: `response.create` }) + }) +}) From 6134e04038278efab51f40c249bc7273dbe524f1 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 12:03:49 +0100 Subject: [PATCH 06/31] feat(agents-runtime): bridge realtime durable streams --- .../agents-runtime/src/context-factory.ts | 184 ++++++++++++++++++ packages/agents-runtime/src/process-wake.ts | 4 + .../test/helpers/context-test-helpers.ts | 5 + .../test/realtime-context.test.ts | 109 ++++++++++- 4 files changed, 301 insertions(+), 1 deletion(-) diff --git a/packages/agents-runtime/src/context-factory.ts b/packages/agents-runtime/src/context-factory.ts index 379a9c54c5..6f569d5cd7 100644 --- a/packages/agents-runtime/src/context-factory.ts +++ b/packages/agents-runtime/src/context-factory.ts @@ -1,4 +1,5 @@ import { queryOnce } from '@durable-streams/state/db' +import { DurableStream } from '@durable-streams/client' import { assembleContext } from './context-assembly' import { createContextEntriesApi } from './context-entries' import { entityStateSchema } from './entity-schema' @@ -13,6 +14,7 @@ import { getCronStreamPath } from './cron-utils' import { runtimeLog } from './log' import { sliceChars } from './token-budget' import { createContextTools } from './tools/context-tools' +import { appendPathToUrl } from './url' import { CACHE_TIERS } from './types' import { composeToolsWithProviders } from './tool-providers' import { validateSlashCommandDefinitions } from './composer-input' @@ -107,6 +109,176 @@ function applyRealtimeToolPolicy( }) } +type RealtimeStreamConfig = NonNullable +type RealtimeControlInput = + | { type: `input_text`; text: string } + | { type: `input_audio.commit` } + | { type: `response.cancel` } + | { type: `session.close`; reason?: string } +type RealtimeStreamIo = { + writeProviderEvent: (event: RealtimeProviderEvent) => Promise + close: () => Promise +} + +function isRealtimeControlInput(value: unknown): value is RealtimeControlInput { + if (!value || typeof value !== `object`) return false + const type = (value as { type?: unknown }).type + return ( + type === `input_text` || + type === `input_audio.commit` || + type === `response.cancel` || + type === `session.close` + ) +} + +function realtimeDurableStream( + streams: RealtimeStreamConfig, + path: string, + contentType: string +): DurableStream { + return new DurableStream({ + url: appendPathToUrl(streams.baseUrl, path), + headers: streams.headers, + contentType, + batching: false, + }) +} + +function jsonBytes(value: unknown): Uint8Array { + return new TextEncoder().encode(JSON.stringify(value)) +} + +function realtimeControlOutput(event: RealtimeProviderEvent): unknown { + if (event.type !== `output_audio.delta`) return event + return { + type: event.type, + responseId: event.responseId, + itemId: event.itemId, + byteLength: event.audio.byteLength, + } +} + +function createRealtimeStreamIo( + config: HandlerContextConfig, + session: ManifestRealtimeSessionEntry | undefined, + providerSession: RealtimeProviderSession +): RealtimeStreamIo | undefined { + if (!config.realtimeStreams || !session) return undefined + + const abort = new AbortController() + const abortFromRun = (): void => abort.abort() + if (config.runSignal?.aborted) { + abort.abort() + } else { + config.runSignal?.addEventListener(`abort`, abortFromRun, { once: true }) + } + + const audioIn = realtimeDurableStream( + config.realtimeStreams, + session.streams.audio_in, + `audio/pcm` + ) + const audioOut = realtimeDurableStream( + config.realtimeStreams, + session.streams.audio_out, + `audio/pcm` + ) + const controlIn = realtimeDurableStream( + config.realtimeStreams, + session.streams.control_in, + `application/json` + ) + const controlOut = realtimeDurableStream( + config.realtimeStreams, + session.streams.control_out, + `application/json` + ) + const tasks: Array> = [] + + if (providerSession.appendInputAudio) { + tasks.push( + (async () => { + const response = await audioIn.stream({ + live: true, + signal: abort.signal, + warnOnHttp: false, + }) + try { + for await (const chunk of response.bodyStream()) { + if (abort.signal.aborted) break + await providerSession.appendInputAudio?.(chunk) + } + } finally { + response.cancel() + } + })().catch((error) => { + if (!abort.signal.aborted) { + runtimeLog.warn( + `[agent-runtime] realtime audio/in pump failed:`, + error + ) + } + }) + ) + } + + tasks.push( + (async () => { + const response = await controlIn.stream({ + live: true, + signal: abort.signal, + json: true, + warnOnHttp: false, + }) + try { + for await (const command of response.jsonStream()) { + if (abort.signal.aborted || !isRealtimeControlInput(command)) { + continue + } + switch (command.type) { + case `input_text`: + await providerSession.sendText?.(command.text) + break + case `input_audio.commit`: + await providerSession.commitInputAudio?.() + break + case `response.cancel`: + await providerSession.cancelResponse?.() + break + case `session.close`: + await providerSession.close?.(command.reason) + abort.abort() + break + } + } + } finally { + response.cancel() + } + })().catch((error) => { + if (!abort.signal.aborted) { + runtimeLog.warn( + `[agent-runtime] realtime control/in pump failed:`, + error + ) + } + }) + ) + + return { + async writeProviderEvent(event) { + if (event.type === `output_audio.delta`) { + await audioOut.append(event.audio) + } + await controlOut.append(jsonBytes(realtimeControlOutput(event))) + }, + async close() { + abort.abort() + config.runSignal?.removeEventListener(`abort`, abortFromRun) + await Promise.allSettled(tasks) + }, + } +} + const MAX_HYDRATED_IMAGE_ATTACHMENTS = 4 const MAX_HYDRATED_IMAGE_ATTACHMENT_BYTES = 10 * 1024 * 1024 @@ -138,6 +310,10 @@ export interface HandlerContextConfig { }) => void | Promise ) => void hydratedEventSourceWake?: HydratedEventSourceWake | null + realtimeStreams?: { + baseUrl: string + headers?: Record + } doObserve: ( source: ObservationSource, wake?: Wake @@ -1016,6 +1192,7 @@ export function createHandlerContext( const messages = await hydrateAttachmentBlocks( timelineToMessages(config.db) ) + let realtimeIo: RealtimeStreamIo | undefined async function handleProviderEvent( event: RealtimeProviderEvent @@ -1132,11 +1309,17 @@ export function createHandlerContext( session: activeRealtimeSession(), signal: config.runSignal, }) + realtimeIo = createRealtimeStreamIo( + config, + activeRealtimeSession(), + activeRealtimeProviderSession + ) for await (const event of activeRealtimeProviderSession.events) { if (config.runSignal?.aborted) { break } + await realtimeIo?.writeProviderEvent(event) await handleProviderEvent(event) } } @@ -1158,6 +1341,7 @@ export function createHandlerContext( bridge.onRunEnd({ finishReason: `error` }) throw error } finally { + await realtimeIo?.close() activeRealtimeProviderSession = null } diff --git a/packages/agents-runtime/src/process-wake.ts b/packages/agents-runtime/src/process-wake.ts index 623f914b20..dc51203dd3 100644 --- a/packages/agents-runtime/src/process-wake.ts +++ b/packages/agents-runtime/src/process-wake.ts @@ -2098,6 +2098,10 @@ export async function processWake( activeSignalHandler = handler }, hydratedEventSourceWake: await hydrateCurrentEventSourceWake(), + realtimeStreams: { + baseUrl, + headers: serverHeaders, + }, doObserve, doSpawn, doFork, diff --git a/packages/agents-runtime/test/helpers/context-test-helpers.ts b/packages/agents-runtime/test/helpers/context-test-helpers.ts index 2aa60e55fd..78faad4d6d 100644 --- a/packages/agents-runtime/test/helpers/context-test-helpers.ts +++ b/packages/agents-runtime/test/helpers/context-test-helpers.ts @@ -304,6 +304,10 @@ export function createTestHandlerContext( wakeEvent?: WakeEvent hydratedEventSourceWake?: HydratedEventSourceWake | null prepareAgentRun?: () => Promise + realtimeStreams?: { + baseUrl: string + headers?: Record + } } = {} ) { const db = opts.db ?? buildStreamFixture([]) @@ -334,6 +338,7 @@ export function createTestHandlerContext( payload: `hi`, }, hydratedEventSourceWake: opts.hydratedEventSourceWake, + realtimeStreams: opts.realtimeStreams, prepareAgentRun: opts.prepareAgentRun, doObserve: vi.fn(), doSpawn: vi.fn(), diff --git a/packages/agents-runtime/test/realtime-context.test.ts b/packages/agents-runtime/test/realtime-context.test.ts index 9e5ebfac2d..a16c88ccb1 100644 --- a/packages/agents-runtime/test/realtime-context.test.ts +++ b/packages/agents-runtime/test/realtime-context.test.ts @@ -1,8 +1,37 @@ -import { describe, expect, it } from 'vitest' +import { beforeEach, describe, expect, it, vi } from 'vitest' import { createTestRealtimeProvider } from '../src/realtime' import { createTestHandlerContext } from './helpers/context-test-helpers' +const durableMock = vi.hoisted(() => { + const appends: Array<{ url: string; data: unknown }> = [] + class DurableStream { + constructor(readonly opts: { url: string }) {} + + async append(data: unknown): Promise { + appends.push({ url: this.opts.url, data }) + } + + async stream() { + return { + bodyStream: async function* () {}, + jsonStream: async function* () {}, + cancel: vi.fn(), + } + } + } + + return { appends, DurableStream } +}) + +vi.mock(`@durable-streams/client`, () => ({ + DurableStream: durableMock.DurableStream, +})) + describe(`ctx.useRealtime()`, () => { + beforeEach(() => { + durableMock.appends.length = 0 + }) + it(`records provider transcript output through the outbound bridge`, async () => { const { ctx } = createTestHandlerContext() @@ -62,4 +91,82 @@ describe(`ctx.useRealtime()`, () => { status: `active`, }) }) + + it(`persists provider audio and control output to realtime durable streams`, async () => { + const { ctx } = createTestHandlerContext({ + realtimeStreams: { + baseUrl: `http://server.test`, + headers: { authorization: `Bearer claim` }, + }, + }) + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `active`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/test/entity/realtime/rt-1/audio/in`, + audio_out: `/test/entity/realtime/rt-1/audio/out`, + control_in: `/test/entity/realtime/rt-1/control/in`, + control_out: `/test/entity/realtime/rt-1/control/out`, + }, + }) + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ + events: [ + { type: `session.started`, sessionId: `rt-1` }, + { + type: `output_audio.delta`, + audio: new Uint8Array([1, 2, 3]), + responseId: `resp-1`, + itemId: `item-1`, + }, + { type: `output_audio.completed`, responseId: `resp-1` }, + { type: `session.closed` }, + ], + }), + tools: [], + }) + + await realtime.run() + + expect(durableMock.appends).toEqual([ + { + url: `http://server.test/test/entity/realtime/rt-1/control/out`, + data: expect.any(Uint8Array), + }, + { + url: `http://server.test/test/entity/realtime/rt-1/audio/out`, + data: new Uint8Array([1, 2, 3]), + }, + { + url: `http://server.test/test/entity/realtime/rt-1/control/out`, + data: expect.any(Uint8Array), + }, + { + url: `http://server.test/test/entity/realtime/rt-1/control/out`, + data: expect.any(Uint8Array), + }, + { + url: `http://server.test/test/entity/realtime/rt-1/control/out`, + data: expect.any(Uint8Array), + }, + ]) + const decoder = new TextDecoder() + expect( + JSON.parse(decoder.decode(durableMock.appends[2]!.data as Uint8Array)) + ).toEqual({ + type: `output_audio.delta`, + responseId: `resp-1`, + itemId: `item-1`, + byteLength: 3, + }) + }) }) From cd7747a24278bfc15ca4925500c41465d0a8a2e8 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 12:07:05 +0100 Subject: [PATCH 07/31] feat(agents): route horton realtime sessions --- packages/agents/src/agents/horton.ts | 81 +++++++++++++++-- .../test/horton-tool-composition.test.ts | 90 +++++++++++++++++++ 2 files changed, 162 insertions(+), 9 deletions(-) diff --git a/packages/agents/src/agents/horton.ts b/packages/agents/src/agents/horton.ts index fcaadd8471..9033a6b3ee 100644 --- a/packages/agents/src/agents/horton.ts +++ b/packages/agents/src/agents/horton.ts @@ -17,6 +17,7 @@ import type { AgentTool, StreamFn } from '@mariozechner/pi-agent-core' import { buildSkillSlashCommands, createContextSkillLoader, + createOpenAIRealtimeProvider, completeWithLowCostModel, } from '@electric-ax/agents-runtime' import type { @@ -46,6 +47,15 @@ const TITLE_USER_PROMPT = (userMessage: string): string => `User request:\n${userMessage}` const TITLE_GENERATION_TIMEOUT_MS = 8_000 const HORTON_SKILLS_SLASH_COMMAND_OWNER = `horton:skills` +const HORTON_REALTIME_DIRECT_TOOLS = new Set([ + `web_search`, + `fetch_url`, + `spawn_worker`, + `send`, + `search_electric_agents_docs`, + `use_skill`, + `remove_skill`, +]) const TITLE_STOP_WORDS = new Set([ `a`, @@ -322,6 +332,25 @@ function getToolName(tool: unknown): string | null { return typeof name === `string` ? name : null } +function hortonRealtimeDirectTools(tools: Array): Array { + return tools + .map((tool) => getToolName(tool)) + .filter( + (name): name is string => + name !== null && HORTON_REALTIME_DIRECT_TOOLS.has(name) + ) +} + +function hortonRealtimeSystemPrompt(systemPrompt: string): string { + return `${systemPrompt} + +# Realtime mode +You are speaking with the user live. Keep responses concise enough for voice. +Prefer dispatching workers for coding, shell, edit, or other long-running tasks. +Use direct tools only for lightweight orchestration, lookup, context loading, and sending messages. +When a task may change files, run commands, or take more than a short exchange, spawn a worker and tell the user you are handing it off.` +} + export function createHortonTools( sandbox: Sandbox, ctx: HandlerContext, @@ -665,16 +694,50 @@ function createAssistantHandler(options: { }) } + const systemPrompt = buildHortonSystemPrompt(sandboxCwd, { + hasDocsSupport: Boolean(docsSupport), + hasSkills, + docsUrl, + modelProvider: modelConfig.provider, + modelId: String(modelConfig.model), + hasEventSourceTools, + hasScheduleTools, + }) + const activeRealtimeSession = ctx.realtime?.activeSession?.() + if (activeRealtimeSession) { + if (activeRealtimeSession.provider !== `openai`) { + throw new Error( + `Horton realtime currently supports provider "openai", got "${activeRealtimeSession.provider}"` + ) + } + const apiKey = process.env.OPENAI_API_KEY + if (!apiKey) { + throw new Error( + `OPENAI_API_KEY must be set before starting Horton realtime mode` + ) + } + const realtime = ctx.useRealtime({ + systemPrompt: hortonRealtimeSystemPrompt(systemPrompt), + provider: createOpenAIRealtimeProvider({ + apiKey, + model: activeRealtimeSession.model, + }), + tools: tools as AgentTool[], + audio: { + inputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + outputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + }, + toolPolicy: { + direct: hortonRealtimeDirectTools(tools as AgentTool[]), + }, + }) + await realtime.run() + await titlePromise + return + } + ctx.useAgent({ - systemPrompt: buildHortonSystemPrompt(sandboxCwd, { - hasDocsSupport: Boolean(docsSupport), - hasSkills, - docsUrl, - modelProvider: modelConfig.provider, - modelId: String(modelConfig.model), - hasEventSourceTools, - hasScheduleTools, - }), + systemPrompt, ...modelConfig, // mcp.tools() inserts sentinel objects that the runtime's // composeToolsWithProviders resolves at wake time. The static type of diff --git a/packages/agents/test/horton-tool-composition.test.ts b/packages/agents/test/horton-tool-composition.test.ts index 061926d46d..8c05e003b4 100644 --- a/packages/agents/test/horton-tool-composition.test.ts +++ b/packages/agents/test/horton-tool-composition.test.ts @@ -133,6 +133,96 @@ describe(`horton tool composition`, () => { await expect(extractFirstUserMessage(ctx)).resolves.toBe(`first`) }) + it(`uses realtime mode as an OpenAI orchestrator when a session is active`, async () => { + const registry = createEntityRegistry() + registerHorton(registry, { workingDirectory: `/tmp`, modelCatalog }) + const previousOpenAIKey = process.env.OPENAI_API_KEY + process.env.OPENAI_API_KEY = `sk-test` + const realtimeRun = vi.fn(async () => {}) + const useRealtime = vi.fn(() => ({ run: realtimeRun })) + const useAgent = vi.fn(() => ({ run: vi.fn(async () => {}) })) + const fakeCtx = { + args: {}, + electricTools: [], + events: [], + firstWake: false, + tags: { title: `Existing title` }, + db: { collections: { inbox: { toArray: [] } } }, + sandbox: { + workingDirectory: `/work`, + readFile: vi.fn(async () => { + throw new Error(`ENOENT`) + }), + }, + slashCommands: { replaceOwned: vi.fn() }, + insertContext: vi.fn(), + removeContext: vi.fn(), + getContext: vi.fn(), + useContext: vi.fn(), + useAgent, + useRealtime, + realtime: { + activeSession: () => ({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `active`, + startedAt: `2026-06-09T12:00:00.000Z`, + retention: `forever`, + streams: { + audio_in: `/horton/demo/realtime/rt-1/audio/in`, + audio_out: `/horton/demo/realtime/rt-1/audio/out`, + control_in: `/horton/demo/realtime/rt-1/control/in`, + control_out: `/horton/demo/realtime/rt-1/control/out`, + }, + }), + }, + } as any + + try { + await registry + .get(`horton`)! + .definition.handler(fakeCtx, { type: `inbox` } as any) + } finally { + if (previousOpenAIKey === undefined) { + delete process.env.OPENAI_API_KEY + } else { + process.env.OPENAI_API_KEY = previousOpenAIKey + } + } + + expect(useAgent).not.toHaveBeenCalled() + expect(useRealtime).toHaveBeenCalledTimes(1) + expect(realtimeRun).toHaveBeenCalledTimes(1) + const realtimeConfig = ( + useRealtime.mock.calls as unknown as Array< + [ + { + provider: { id: string; model: string } + toolPolicy: { direct: Array } + }, + ] + > + )[0]![0] + expect(realtimeConfig.provider).toMatchObject({ + id: `openai`, + model: `gpt-realtime-2`, + }) + expect(realtimeConfig.toolPolicy.direct).toEqual( + expect.arrayContaining([ + `web_search`, + `fetch_url`, + `spawn_worker`, + `send`, + ]) + ) + expect(realtimeConfig.toolPolicy.direct).not.toEqual( + expect.arrayContaining([`bash`, `read`, `write`, `edit`]) + ) + }) + it(`orders title candidates with the _seq fallback convention`, async () => { const ctx = { db: { From 6b41d7175be02b87e3238325b1d55b11a23bad5e Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 12:10:08 +0100 Subject: [PATCH 08/31] feat(agents-ui): add realtime voice toggle --- .../src/components/MessageInput.module.css | 5 + .../src/components/MessageInput.tsx | 91 +++++++- .../src/lib/realtime-audio.ts | 213 ++++++++++++++++++ 3 files changed, 299 insertions(+), 10 deletions(-) create mode 100644 packages/agents-server-ui/src/lib/realtime-audio.ts diff --git a/packages/agents-server-ui/src/components/MessageInput.module.css b/packages/agents-server-ui/src/components/MessageInput.module.css index a9fa5e06f7..5c2fbccf24 100644 --- a/packages/agents-server-ui/src/components/MessageInput.module.css +++ b/packages/agents-server-ui/src/components/MessageInput.module.css @@ -63,6 +63,11 @@ color: var(--ds-text-1); } +.inlineIconButton.voiceActive { + background: var(--ds-accent-a3); + color: var(--ds-accent-11); +} + .inlineIconButton:focus-visible { outline: 2px solid var(--ds-accent-a6); outline-offset: -2px; diff --git a/packages/agents-server-ui/src/components/MessageInput.tsx b/packages/agents-server-ui/src/components/MessageInput.tsx index 74e79243ec..46ab078c9a 100644 --- a/packages/agents-server-ui/src/components/MessageInput.tsx +++ b/packages/agents-server-ui/src/components/MessageInput.tsx @@ -1,5 +1,5 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react' -import { ArrowUp, Square } from 'lucide-react' +import { ArrowUp, Mic, MicOff, Square } from 'lucide-react' import { useLiveQuery } from '@tanstack/react-db' import type { EntityStreamDBWithActions } from '@electric-ax/agents-runtime/client' import { @@ -10,6 +10,10 @@ import { readTextPayload, } from '../lib/sendMessage' import { serializeComposerInput } from '@electric-ax/agents-runtime/client' +import { + startRealtimeAudioSession, + type RealtimeAudioSession, +} from '../lib/realtime-audio' import { ComposerEditor } from './ComposerEditor' import { ComposerShell } from './ComposerShell' import { Icon, Stack, Text, Tooltip } from '../ui' @@ -85,6 +89,9 @@ export function MessageInput({ key: string originalText: string } | null>(null) + const [realtimePending, setRealtimePending] = useState(false) + const [realtimeActive, setRealtimeActive] = useState(false) + const realtimeSessionRef = useRef(null) const composerFocusRef = useRef<{ focus: () => void } | null>(null) const inputDisabled = disabled || writeDisabled const attachmentsDisabled = @@ -164,6 +171,14 @@ export function MessageInput({ attachmentCount === 0 && !disabled const canStop = showStop && !stopPending && !stopDisabled + const canUseRealtime = !inputDisabled && !editingMessage && Boolean(baseUrl) + + useEffect(() => { + return () => { + void realtimeSessionRef.current?.stop() + realtimeSessionRef.current = null + } + }, []) const handleSubmit = useCallback( (composerPayload?: ComposerInputPayload) => { @@ -221,6 +236,37 @@ export function MessageInput({ handleSubmit() }, [canStop, handleSubmit, onStop]) + const handleRealtimeToggle = useCallback(() => { + if (realtimePending) return + setError(null) + if (realtimeSessionRef.current) { + const session = realtimeSessionRef.current + realtimeSessionRef.current = null + setRealtimePending(true) + session + .stop() + .catch((err: Error) => setError(err.message)) + .finally(() => { + setRealtimeActive(false) + setRealtimePending(false) + }) + return + } + if (!canUseRealtime) return + setRealtimePending(true) + startRealtimeAudioSession({ baseUrl, entityUrl }) + .then((session) => { + realtimeSessionRef.current = session + setRealtimeActive(true) + }) + .catch((err: Error) => { + setError(err.message) + }) + .finally(() => { + setRealtimePending(false) + }) + }, [baseUrl, canUseRealtime, entityUrl, realtimePending]) + const startEditing = useCallback( (message: EntityTimelineData[`inbox`][number]) => { if (inputDisabled) return @@ -349,15 +395,40 @@ export function MessageInput({ ) : null } controls={ - imageAttachmentsEnabled ? ( - - ) : null + <> + + + + + + {imageAttachmentsEnabled ? ( + + ) : null} + } send={ diff --git a/packages/agents-server-ui/src/lib/realtime-audio.ts b/packages/agents-server-ui/src/lib/realtime-audio.ts new file mode 100644 index 0000000000..c5a605b8da --- /dev/null +++ b/packages/agents-server-ui/src/lib/realtime-audio.ts @@ -0,0 +1,213 @@ +import { DurableStream } from '@durable-streams/client' +import { appendPathToUrl } from '@electric-ax/agents-runtime/client' +import { serverFetch, getConfiguredServerHeaders } from './auth-fetch' + +export type RealtimeAudioSession = { + sessionId: string + stop: () => Promise +} + +type RealtimeSessionCreateResult = { + sessionId: string + streams: { + audio_in: string + audio_out: string + control_in: string + control_out: string + } +} + +const REALTIME_SAMPLE_RATE = 24_000 + +function realtimeUrl(baseUrl: string): string { + return appendPathToUrl(baseUrl, `/_electric/realtime/sessions`) +} + +function streamUrl(baseUrl: string, streamPath: string): string { + return appendPathToUrl(baseUrl, streamPath) +} + +function pcm16Bytes(input: Float32Array): Uint8Array { + const bytes = new Uint8Array(input.length * 2) + const view = new DataView(bytes.buffer) + for (let index = 0; index < input.length; index += 1) { + const sample = Math.max(-1, Math.min(1, input[index] ?? 0)) + view.setInt16( + index * 2, + sample < 0 ? sample * 0x8000 : sample * 0x7fff, + true + ) + } + return bytes +} + +function pcm16Floats(bytes: Uint8Array): Float32Array { + const view = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength) + const output = new Float32Array(Math.floor(bytes.byteLength / 2)) + for (let index = 0; index < output.length; index += 1) { + output[index] = view.getInt16(index * 2, true) / 0x8000 + } + return output +} + +function streamHandle( + baseUrl: string, + path: string, + contentType: string +): DurableStream { + const url = streamUrl(baseUrl, path) + return new DurableStream({ + url, + headers: getConfiguredServerHeaders(url), + contentType, + batching: false, + }) +} + +function createAudioContext(): AudioContext { + return new AudioContext({ sampleRate: REALTIME_SAMPLE_RATE }) +} + +async function createRealtimeSession( + baseUrl: string, + entityUrl: string +): Promise { + const response = await serverFetch(realtimeUrl(baseUrl), { + method: `POST`, + headers: { 'content-type': `application/json` }, + body: JSON.stringify({ + entityUrl, + provider: `openai`, + model: `gpt-realtime-2`, + inputAudio: { + codec: `pcm16`, + sampleRate: REALTIME_SAMPLE_RATE, + channels: 1, + }, + outputAudio: { + codec: `pcm16`, + sampleRate: REALTIME_SAMPLE_RATE, + channels: 1, + }, + meta: { source: `agents-server-ui` }, + }), + }) + if (!response.ok) { + throw new Error( + `Failed to start realtime session (${response.status}): ${await response.text()}` + ) + } + return (await response.json()) as RealtimeSessionCreateResult +} + +export async function startRealtimeAudioSession({ + baseUrl, + entityUrl, +}: { + baseUrl: string + entityUrl: string +}): Promise { + const session = await createRealtimeSession(baseUrl, entityUrl) + const abort = new AbortController() + const micContext = createAudioContext() + const playbackContext = createAudioContext() + const media = await navigator.mediaDevices.getUserMedia({ + audio: { + channelCount: 1, + sampleRate: REALTIME_SAMPLE_RATE, + echoCancellation: true, + noiseSuppression: true, + autoGainControl: true, + }, + }) + const audioIn = streamHandle( + baseUrl, + session.streams.audio_in, + `audio/pcm; rate=${REALTIME_SAMPLE_RATE}; channels=1` + ) + const audioOut = streamHandle( + baseUrl, + session.streams.audio_out, + `audio/pcm; rate=${REALTIME_SAMPLE_RATE}; channels=1` + ) + const controlIn = streamHandle( + baseUrl, + session.streams.control_in, + `application/json` + ) + + const source = micContext.createMediaStreamSource(media) + const processor = micContext.createScriptProcessor(1024, 1, 1) + const silentOutput = micContext.createGain() + silentOutput.gain.value = 0 + let appendQueue = Promise.resolve() + processor.onaudioprocess = (event) => { + if (abort.signal.aborted) return + const input = event.inputBuffer.getChannelData(0) + const bytes = pcm16Bytes(input) + appendQueue = appendQueue + .then(() => audioIn.append(bytes)) + .catch((error) => { + console.warn(`[realtime-audio] microphone append failed`, error) + }) + } + source.connect(processor) + processor.connect(silentOutput) + silentOutput.connect(micContext.destination) + + let nextPlaybackTime = playbackContext.currentTime + const playback = (async () => { + const response = await audioOut.stream({ + live: true, + signal: abort.signal, + warnOnHttp: false, + }) + try { + for await (const chunk of response.bodyStream()) { + if (abort.signal.aborted || chunk.byteLength === 0) continue + const samples = pcm16Floats(chunk) + const buffer = playbackContext.createBuffer( + 1, + samples.length, + REALTIME_SAMPLE_RATE + ) + const channel = new Float32Array(samples.length) + channel.set(samples) + buffer.copyToChannel(channel, 0) + const node = playbackContext.createBufferSource() + node.buffer = buffer + node.connect(playbackContext.destination) + const startAt = Math.max(playbackContext.currentTime, nextPlaybackTime) + node.start(startAt) + nextPlaybackTime = startAt + buffer.duration + } + } finally { + response.cancel() + } + })().catch((error) => { + if (!abort.signal.aborted) { + console.warn(`[realtime-audio] playback stream failed`, error) + } + }) + + return { + sessionId: session.sessionId, + async stop() { + abort.abort() + processor.disconnect() + silentOutput.disconnect() + source.disconnect() + for (const track of media.getTracks()) track.stop() + await appendQueue.catch(() => undefined) + await controlIn + .append( + new TextEncoder().encode( + JSON.stringify({ type: `session.close`, reason: `client-stop` }) + ) + ) + .catch(() => undefined) + await playback + await Promise.allSettled([micContext.close(), playbackContext.close()]) + }, + } +} From ff1b1ebd30e7a01df72c5d96cddee1425a1d2d9f Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 12:10:50 +0100 Subject: [PATCH 09/31] feat(agents-ui): route realtime text input --- .../agents-server-ui/src/components/MessageInput.tsx | 10 ++++++++++ packages/agents-server-ui/src/lib/realtime-audio.ts | 6 ++++++ 2 files changed, 16 insertions(+) diff --git a/packages/agents-server-ui/src/components/MessageInput.tsx b/packages/agents-server-ui/src/components/MessageInput.tsx index 46ab078c9a..599b04d9e2 100644 --- a/packages/agents-server-ui/src/components/MessageInput.tsx +++ b/packages/agents-server-ui/src/components/MessageInput.tsx @@ -186,6 +186,16 @@ export function MessageInput({ setError(null) const text = value.trim() const files = imageAttachmentsEnabled ? attachments : [] + if (realtimeSessionRef.current && !editingMessage && files.length === 0) { + const session = realtimeSessionRef.current + setValue(``) + onSend?.() + session.sendText(text).catch((err: Error) => { + setError(err.message) + setValue((current) => (current ? current : text)) + }) + return + } const tx = editingMessage ? updateAction?.({ key: editingMessage.key, diff --git a/packages/agents-server-ui/src/lib/realtime-audio.ts b/packages/agents-server-ui/src/lib/realtime-audio.ts index c5a605b8da..524868dfe4 100644 --- a/packages/agents-server-ui/src/lib/realtime-audio.ts +++ b/packages/agents-server-ui/src/lib/realtime-audio.ts @@ -4,6 +4,7 @@ import { serverFetch, getConfiguredServerHeaders } from './auth-fetch' export type RealtimeAudioSession = { sessionId: string + sendText: (text: string) => Promise stop: () => Promise } @@ -192,6 +193,11 @@ export async function startRealtimeAudioSession({ return { sessionId: session.sessionId, + async sendText(text: string) { + await controlIn.append( + new TextEncoder().encode(JSON.stringify({ type: `input_text`, text })) + ) + }, async stop() { abort.abort() processor.disconnect() From 1ac844459cb337bcab2487747396d362aa8b27a9 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 13:00:53 +0100 Subject: [PATCH 10/31] fix(agents): harden realtime session lifecycle --- .../agents-runtime/src/context-factory.ts | 81 ++++- .../agents-runtime/src/openai-realtime.ts | 52 ++- .../test/openai-realtime.test.ts | 56 +++- .../test/realtime-context.test.ts | 90 +++++ .../src/lib/realtime-audio.ts | 310 ++++++++++++------ packages/agents-server/src/entity-manager.ts | 7 + ...ic-agents-manager-write-validation.test.ts | 15 + packages/agents/src/agents/horton.ts | 16 +- 8 files changed, 516 insertions(+), 111 deletions(-) diff --git a/packages/agents-runtime/src/context-factory.ts b/packages/agents-runtime/src/context-factory.ts index 6f569d5cd7..622b15ee45 100644 --- a/packages/agents-runtime/src/context-factory.ts +++ b/packages/agents-runtime/src/context-factory.ts @@ -114,6 +114,7 @@ type RealtimeControlInput = | { type: `input_text`; text: string } | { type: `input_audio.commit` } | { type: `response.cancel` } + | { type: `output_audio.truncate`; itemId: string; audioEndMs: number } | { type: `session.close`; reason?: string } type RealtimeStreamIo = { writeProviderEvent: (event: RealtimeProviderEvent) => Promise @@ -123,6 +124,12 @@ type RealtimeStreamIo = { function isRealtimeControlInput(value: unknown): value is RealtimeControlInput { if (!value || typeof value !== `object`) return false const type = (value as { type?: unknown }).type + if (type === `output_audio.truncate`) { + return ( + typeof (value as { itemId?: unknown }).itemId === `string` && + typeof (value as { audioEndMs?: unknown }).audioEndMs === `number` + ) + } return ( type === `input_text` || type === `input_audio.commit` || @@ -245,6 +252,12 @@ function createRealtimeStreamIo( case `response.cancel`: await providerSession.cancelResponse?.() break + case `output_audio.truncate`: + await providerSession.truncateOutputAudio?.({ + itemId: command.itemId, + audioEndMs: command.audioEndMs, + }) + break case `session.close`: await providerSession.close?.(command.reason) abort.abort() @@ -747,6 +760,57 @@ export function createHandlerContext( return realtimeSessions().filter(realtimeManifestIsActive).at(-1) } + async function updateRealtimeSessionStatus( + session: ManifestRealtimeSessionEntry | undefined, + status: `active` | `closed` | `failed`, + opts: { reason?: string; error?: string } = {} + ): Promise { + if (!session) return + + const key = session.key ?? `realtime-session:${session.id}` + const terminal = status === `closed` || status === `failed` + const endedAt = terminal ? new Date().toISOString() : session.endedAt + const meta = { + ...(session.meta ?? {}), + ...(opts.reason ? { reason: opts.reason } : {}), + ...(opts.error ? { error: opts.error } : {}), + } + + const nextSession: ManifestRealtimeSessionEntry = { + key, + kind: `realtime-session`, + id: session.id, + provider: session.provider, + model: session.model, + status, + startedAt: session.startedAt, + endedAt: endedAt ?? null, + streams: session.streams, + retention: `forever`, + ...(Object.keys(meta).length > 0 ? { meta } : {}), + } + + config.wakeSession.registerManifestEntry(nextSession) + config.writeEvent( + entityStateSchema.realtimeSessions.update({ + key, + value: { + session_id: session.id, + provider: session.provider, + model: session.model, + status, + started_at: session.startedAt, + ...(endedAt ? { ended_at: endedAt } : {}), + streams: session.streams, + ...(opts.reason ? { reason: opts.reason } : {}), + ...(opts.error ? { error: opts.error } : {}), + ...(Object.keys(meta).length > 0 ? { meta } : {}), + } as never, + }) as ChangeEvent + ) + await config.wakeSession.commitManifestEntries() + } + function structuralHash(nextConfig: UseContextConfig): string { const sources = Object.entries(nextConfig.sources) .sort(([leftName], [rightName]) => leftName.localeCompare(rightName)) @@ -1193,6 +1257,8 @@ export function createHandlerContext( timelineToMessages(config.db) ) let realtimeIo: RealtimeStreamIo | undefined + const realtimeSession = activeRealtimeSession() + let realtimeSessionTerminalWritten = false async function handleProviderEvent( event: RealtimeProviderEvent @@ -1306,12 +1372,13 @@ export function createHandlerContext( messages, tools: providerTools, audio: activeRealtimeConfig.audio, - session: activeRealtimeSession(), + session: realtimeSession, signal: config.runSignal, }) + await updateRealtimeSessionStatus(realtimeSession, `active`) realtimeIo = createRealtimeStreamIo( config, - activeRealtimeSession(), + realtimeSession, activeRealtimeProviderSession ) @@ -1325,6 +1392,10 @@ export function createHandlerContext( } endText() + await updateRealtimeSessionStatus(realtimeSession, `closed`, { + reason: config.runSignal?.aborted ? `aborted` : `completed`, + }) + realtimeSessionTerminalWritten = true bridge.onStepEnd({ finishReason: config.runSignal?.aborted ? `aborted` : `stop`, durationMs: Date.now() - startedAt, @@ -1334,6 +1405,12 @@ export function createHandlerContext( }) } catch (error) { endText() + if (!realtimeSessionTerminalWritten) { + await updateRealtimeSessionStatus(realtimeSession, `failed`, { + error: error instanceof Error ? error.message : String(error), + }) + realtimeSessionTerminalWritten = true + } bridge.onStepEnd({ finishReason: `error`, durationMs: Date.now() - startedAt, diff --git a/packages/agents-runtime/src/openai-realtime.ts b/packages/agents-runtime/src/openai-realtime.ts index 649fe1a25c..72fdb5ea46 100644 --- a/packages/agents-runtime/src/openai-realtime.ts +++ b/packages/agents-runtime/src/openai-realtime.ts @@ -470,6 +470,26 @@ export function createOpenAIRealtimeProvider( const toolsByName = new Map( input.tools.map((tool) => [toolName(tool), tool]) ) + let socketOpen = false + let socketClosed = false + let rejectOpen: ((error: Error) => void) | undefined + + const closeQueue = (reason?: string): void => { + if (socketClosed) return + socketClosed = true + queue.push({ type: `session.closed`, reason }) + queue.close() + input.signal?.removeEventListener(`abort`, handleAbort) + } + + const handleAbort = (): void => { + const error = new Error( + `[agent-runtime] OpenAI realtime WebSocket aborted` + ) + closeQueue(`aborted`) + ws.close?.(1000, `aborted`) + if (!socketOpen) rejectOpen?.(error) + } const sendToolResult = async ( result: RealtimeToolResult @@ -543,12 +563,22 @@ export function createOpenAIRealtimeProvider( } const opened = new Promise((resolve, reject) => { - onSocket(ws, `open`, () => resolve()) + rejectOpen = reject + onSocket(ws, `open`, () => { + if (socketClosed) return + socketOpen = true + if (input.signal?.aborted) { + handleAbort() + return + } + resolve() + }) onSocket(ws, `error`, (event) => { const error = event instanceof Error ? event : new Error(`[agent-runtime] OpenAI realtime WebSocket error`) + input.signal?.removeEventListener(`abort`, handleAbort) queue.fail(error) reject(error) }) @@ -569,10 +599,15 @@ export function createOpenAIRealtimeProvider( } }) onSocket(ws, `close`, () => { - queue.push({ type: `session.closed` }) - queue.close() + closeQueue() }) + if (input.signal?.aborted) { + handleAbort() + } else { + input.signal?.addEventListener(`abort`, handleAbort, { once: true }) + } + await opened sendJson(ws, buildSessionUpdate(opts, input)) for (const message of input.messages) { @@ -589,6 +624,7 @@ export function createOpenAIRealtimeProvider( }, commitInputAudio: async () => { sendJson(ws, { type: `input_audio_buffer.commit` }) + sendJson(ws, { type: `response.create` }) }, sendText: async (text) => { sendJson(ws, { @@ -605,9 +641,17 @@ export function createOpenAIRealtimeProvider( cancelResponse: async () => { sendJson(ws, { type: `response.cancel` }) }, + truncateOutputAudio: async ({ itemId, audioEndMs }) => { + sendJson(ws, { + type: `conversation.item.truncate`, + item_id: itemId, + content_index: 0, + audio_end_ms: audioEndMs, + }) + }, close: async (reason) => { + closeQueue(reason) ws.close?.(1000, reason) - queue.close() }, } }, diff --git a/packages/agents-runtime/test/openai-realtime.test.ts b/packages/agents-runtime/test/openai-realtime.test.ts index de34722ea6..9fc88444c3 100644 --- a/packages/agents-runtime/test/openai-realtime.test.ts +++ b/packages/agents-runtime/test/openai-realtime.test.ts @@ -132,11 +132,63 @@ describe(`createOpenAIRealtimeProvider`, () => { await session.appendInputAudio?.(new Uint8Array([1, 2, 3])) await session.commitInputAudio?.() - expect(socket.sent.at(-2)).toEqual({ + expect(socket.sent.at(-3)).toEqual({ type: `input_audio_buffer.append`, audio: `AQID`, }) - expect(socket.sent.at(-1)).toEqual({ type: `input_audio_buffer.commit` }) + expect(socket.sent.at(-2)).toEqual({ type: `input_audio_buffer.commit` }) + expect(socket.sent.at(-1)).toEqual({ type: `response.create` }) + }) + + it(`unblocks the event stream when the run signal aborts`, async () => { + FakeWebSocket.instances = [] + const controller = new AbortController() + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + signal: controller.signal, + }) + const iterator = session.events[Symbol.asyncIterator]() + + controller.abort() + + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `session.closed`, + reason: `aborted`, + }) + }) + + it(`can truncate output audio for interrupted playback`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + const socket = FakeWebSocket.instances[0]! + + await session.truncateOutputAudio?.({ + itemId: `item-1`, + audioEndMs: 320, + }) + + expect(socket.sent.at(-1)).toEqual({ + type: `conversation.item.truncate`, + item_id: `item-1`, + content_index: 0, + audio_end_ms: 320, + }) }) it(`maps OpenAI events and executes function calls`, async () => { diff --git a/packages/agents-runtime/test/realtime-context.test.ts b/packages/agents-runtime/test/realtime-context.test.ts index a16c88ccb1..7a4c129ccb 100644 --- a/packages/agents-runtime/test/realtime-context.test.ts +++ b/packages/agents-runtime/test/realtime-context.test.ts @@ -92,6 +92,96 @@ describe(`ctx.useRealtime()`, () => { }) }) + it(`marks realtime sessions closed when the provider stream ends`, async () => { + const { ctx } = createTestHandlerContext() + + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `requested`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/entities/test/realtime/rt-1/audio/in`, + audio_out: `/entities/test/realtime/rt-1/audio/out`, + control_in: `/entities/test/realtime/rt-1/control/in`, + control_out: `/entities/test/realtime/rt-1/control/out`, + }, + }) + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ response: `done` }), + tools: [], + }) + + await realtime.run() + + expect(ctx.realtime.activeSession()).toBeUndefined() + expect( + ctx.db.collections.manifests.get(`realtime-session:rt-1`) + ).toMatchObject({ + status: `closed`, + endedAt: expect.any(String), + meta: { reason: `completed` }, + }) + expect( + ctx.db.collections.realtimeSessions.get(`realtime-session:rt-1`) + ).toMatchObject({ + status: `closed`, + ended_at: expect.any(String), + reason: `completed`, + }) + }) + + it(`marks realtime sessions failed when provider setup fails`, async () => { + const { ctx } = createTestHandlerContext() + + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `requested`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/entities/test/realtime/rt-1/audio/in`, + audio_out: `/entities/test/realtime/rt-1/audio/out`, + control_in: `/entities/test/realtime/rt-1/control/in`, + control_out: `/entities/test/realtime/rt-1/control/out`, + }, + }) + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: { + id: `openai`, + model: `gpt-realtime-2`, + connect: async () => { + throw new Error(`missing key`) + }, + }, + tools: [], + }) + + await expect(realtime.run()).rejects.toThrow(`missing key`) + expect(ctx.realtime.activeSession()).toBeUndefined() + expect( + ctx.db.collections.manifests.get(`realtime-session:rt-1`) + ).toMatchObject({ + status: `failed`, + endedAt: expect.any(String), + meta: { error: `missing key` }, + }) + }) + it(`persists provider audio and control output to realtime durable streams`, async () => { const { ctx } = createTestHandlerContext({ realtimeStreams: { diff --git a/packages/agents-server-ui/src/lib/realtime-audio.ts b/packages/agents-server-ui/src/lib/realtime-audio.ts index 524868dfe4..8d110e8634 100644 --- a/packages/agents-server-ui/src/lib/realtime-audio.ts +++ b/packages/agents-server-ui/src/lib/realtime-audio.ts @@ -18,6 +18,15 @@ type RealtimeSessionCreateResult = { } } +type RealtimeControlOutput = + | { type: `input_audio.speech_started`; audioOffset?: string } + | { type: `output_audio.delta`; itemId?: string; byteLength?: number } + | { type: `output_audio.completed`; responseId?: string; itemId?: string } + | { type: `response.completed`; responseId?: string } + | { type: `response.cancelled`; responseId?: string } + | { type: `session.closed`; reason?: string } + | { type: string; [key: string]: unknown } + const REALTIME_SAMPLE_RATE = 24_000 function realtimeUrl(baseUrl: string): string { @@ -51,6 +60,10 @@ function pcm16Floats(bytes: Uint8Array): Float32Array { return output } +function jsonBytes(value: unknown): Uint8Array { + return new TextEncoder().encode(JSON.stringify(value)) +} + function streamHandle( baseUrl: string, path: string, @@ -108,112 +121,217 @@ export async function startRealtimeAudioSession({ baseUrl: string entityUrl: string }): Promise { - const session = await createRealtimeSession(baseUrl, entityUrl) const abort = new AbortController() const micContext = createAudioContext() const playbackContext = createAudioContext() - const media = await navigator.mediaDevices.getUserMedia({ - audio: { - channelCount: 1, - sampleRate: REALTIME_SAMPLE_RATE, - echoCancellation: true, - noiseSuppression: true, - autoGainControl: true, - }, - }) - const audioIn = streamHandle( - baseUrl, - session.streams.audio_in, - `audio/pcm; rate=${REALTIME_SAMPLE_RATE}; channels=1` - ) - const audioOut = streamHandle( - baseUrl, - session.streams.audio_out, - `audio/pcm; rate=${REALTIME_SAMPLE_RATE}; channels=1` - ) - const controlIn = streamHandle( - baseUrl, - session.streams.control_in, - `application/json` - ) - - const source = micContext.createMediaStreamSource(media) - const processor = micContext.createScriptProcessor(1024, 1, 1) - const silentOutput = micContext.createGain() - silentOutput.gain.value = 0 let appendQueue = Promise.resolve() - processor.onaudioprocess = (event) => { - if (abort.signal.aborted) return - const input = event.inputBuffer.getChannelData(0) - const bytes = pcm16Bytes(input) - appendQueue = appendQueue - .then(() => audioIn.append(bytes)) - .catch((error) => { - console.warn(`[realtime-audio] microphone append failed`, error) - }) + let playback = Promise.resolve() + let control = Promise.resolve() + let media: MediaStream | undefined + let source: MediaStreamAudioSourceNode | undefined + let processor: ScriptProcessorNode | undefined + let silentOutput: GainNode | undefined + let controlIn: DurableStream | undefined + let session: RealtimeSessionCreateResult | undefined + let nextPlaybackTime = playbackContext.currentTime + let currentOutputItemId: string | null = null + let currentOutputStartedAt: number | null = null + const playbackNodes = new Set() + + const appendControl = async (value: unknown): Promise => { + await controlIn?.append(jsonBytes(value)) } - source.connect(processor) - processor.connect(silentOutput) - silentOutput.connect(micContext.destination) - let nextPlaybackTime = playbackContext.currentTime - const playback = (async () => { - const response = await audioOut.stream({ - live: true, - signal: abort.signal, - warnOnHttp: false, - }) - try { - for await (const chunk of response.bodyStream()) { - if (abort.signal.aborted || chunk.byteLength === 0) continue - const samples = pcm16Floats(chunk) - const buffer = playbackContext.createBuffer( - 1, - samples.length, - REALTIME_SAMPLE_RATE - ) - const channel = new Float32Array(samples.length) - channel.set(samples) - buffer.copyToChannel(channel, 0) - const node = playbackContext.createBufferSource() - node.buffer = buffer - node.connect(playbackContext.destination) - const startAt = Math.max(playbackContext.currentTime, nextPlaybackTime) - node.start(startAt) - nextPlaybackTime = startAt + buffer.duration + const stopScheduledPlayback = (): void => { + for (const node of playbackNodes) { + try { + node.stop() + } catch { + // Already stopped. } - } finally { - response.cancel() } - })().catch((error) => { - if (!abort.signal.aborted) { - console.warn(`[realtime-audio] playback stream failed`, error) + playbackNodes.clear() + nextPlaybackTime = playbackContext.currentTime + currentOutputStartedAt = null + } + + const interruptPlayback = (): void => { + const audioEndMs = + currentOutputStartedAt === null + ? 0 + : Math.max( + 0, + Math.floor( + (playbackContext.currentTime - currentOutputStartedAt) * 1000 + ) + ) + const itemId = currentOutputItemId + stopScheduledPlayback() + void appendControl({ type: `response.cancel` }).catch((error) => { + console.warn(`[realtime-audio] response cancel failed`, error) + }) + if (itemId) { + void appendControl({ + type: `output_audio.truncate`, + itemId, + audioEndMs, + }).catch((error) => { + console.warn(`[realtime-audio] output truncate failed`, error) + }) + } + } + + const cleanup = async (sendClose: boolean): Promise => { + abort.abort() + processor?.disconnect() + silentOutput?.disconnect() + source?.disconnect() + for (const track of media?.getTracks() ?? []) track.stop() + stopScheduledPlayback() + await appendQueue.catch(() => undefined) + if (sendClose && controlIn) { + await appendControl({ + type: `session.close`, + reason: `client-stop`, + }).catch(() => undefined) } - }) + await Promise.allSettled([playback, control]) + await Promise.allSettled([micContext.close(), playbackContext.close()]) + } - return { - sessionId: session.sessionId, - async sendText(text: string) { - await controlIn.append( - new TextEncoder().encode(JSON.stringify({ type: `input_text`, text })) - ) - }, - async stop() { - abort.abort() - processor.disconnect() - silentOutput.disconnect() - source.disconnect() - for (const track of media.getTracks()) track.stop() - await appendQueue.catch(() => undefined) - await controlIn - .append( - new TextEncoder().encode( - JSON.stringify({ type: `session.close`, reason: `client-stop` }) + try { + media = await navigator.mediaDevices.getUserMedia({ + audio: { + channelCount: 1, + sampleRate: REALTIME_SAMPLE_RATE, + echoCancellation: true, + noiseSuppression: true, + autoGainControl: true, + }, + }) + session = await createRealtimeSession(baseUrl, entityUrl) + const audioIn = streamHandle( + baseUrl, + session.streams.audio_in, + `audio/pcm; rate=${REALTIME_SAMPLE_RATE}; channels=1` + ) + const audioOut = streamHandle( + baseUrl, + session.streams.audio_out, + `audio/pcm; rate=${REALTIME_SAMPLE_RATE}; channels=1` + ) + controlIn = streamHandle( + baseUrl, + session.streams.control_in, + `application/json` + ) + const controlOut = streamHandle( + baseUrl, + session.streams.control_out, + `application/json` + ) + + source = micContext.createMediaStreamSource(media) + processor = micContext.createScriptProcessor(1024, 1, 1) + silentOutput = micContext.createGain() + silentOutput.gain.value = 0 + processor.onaudioprocess = (event) => { + if (abort.signal.aborted) return + const input = event.inputBuffer.getChannelData(0) + const bytes = pcm16Bytes(input) + appendQueue = appendQueue + .then(() => audioIn.append(bytes)) + .catch((error) => { + console.warn(`[realtime-audio] microphone append failed`, error) + }) + } + source.connect(processor) + processor.connect(silentOutput) + silentOutput.connect(micContext.destination) + + playback = (async () => { + const response = await audioOut.stream({ + live: true, + signal: abort.signal, + warnOnHttp: false, + }) + try { + for await (const chunk of response.bodyStream()) { + if (abort.signal.aborted || chunk.byteLength === 0) continue + const samples = pcm16Floats(chunk) + const buffer = playbackContext.createBuffer( + 1, + samples.length, + REALTIME_SAMPLE_RATE + ) + const channel = new Float32Array(samples.length) + channel.set(samples) + buffer.copyToChannel(channel, 0) + const node = playbackContext.createBufferSource() + node.buffer = buffer + node.connect(playbackContext.destination) + node.onended = () => playbackNodes.delete(node) + playbackNodes.add(node) + const startAt = Math.max( + playbackContext.currentTime, + nextPlaybackTime ) - ) - .catch(() => undefined) - await playback - await Promise.allSettled([micContext.close(), playbackContext.close()]) - }, + if (currentOutputStartedAt === null) { + currentOutputStartedAt = startAt + } + node.start(startAt) + nextPlaybackTime = startAt + buffer.duration + } + } finally { + response.cancel() + } + })().catch((error) => { + if (!abort.signal.aborted) { + console.warn(`[realtime-audio] playback stream failed`, error) + } + }) + + control = (async () => { + const response = await controlOut.stream({ + live: true, + signal: abort.signal, + json: true, + warnOnHttp: false, + }) + try { + for await (const event of response.jsonStream()) { + if (abort.signal.aborted || !event || typeof event !== `object`) { + continue + } + if ( + event.type === `output_audio.delta` && + typeof event.itemId === `string` + ) { + currentOutputItemId = event.itemId + } else if (event.type === `input_audio.speech_started`) { + interruptPlayback() + } + } + } finally { + response.cancel() + } + })().catch((error) => { + if (!abort.signal.aborted) { + console.warn(`[realtime-audio] control stream failed`, error) + } + }) + + return { + sessionId: session.sessionId, + async sendText(text: string) { + await appendControl({ type: `input_text`, text }) + }, + async stop() { + await cleanup(true) + }, + } + } catch (error) { + await cleanup(Boolean(session)) + throw error } } diff --git a/packages/agents-server/src/entity-manager.ts b/packages/agents-server/src/entity-manager.ts index f3a09b82a9..b6db919629 100644 --- a/packages/agents-server/src/entity-manager.ts +++ b/packages/agents-server/src/entity-manager.ts @@ -2573,6 +2573,13 @@ export class EntityManager { 400 ) } + if (provider !== `openai`) { + throw new ElectricAgentsError( + ErrCodeInvalidRequest, + `Realtime provider "${provider}" is not supported; expected "openai"`, + 400 + ) + } const sessionId = req.id ?? `rt-${randomUUID()}` validateRealtimeSessionId(sessionId) diff --git a/packages/agents-server/test/electric-agents-manager-write-validation.test.ts b/packages/agents-server/test/electric-agents-manager-write-validation.test.ts index ffa8ec7fc9..c4922ec140 100644 --- a/packages/agents-server/test/electric-agents-manager-write-validation.test.ts +++ b/packages/agents-server/test/electric-agents-manager-write-validation.test.ts @@ -234,6 +234,21 @@ describe(`ElectricAgentsManager realtime sessions`, () => { }) expect(inboxEvent.value).not.toHaveProperty(`message_type`) }) + + it(`rejects non-OpenAI realtime providers in V1`, async () => { + const { manager } = createAttachmentManager() + + await expect( + manager.createRealtimeSession(`/chat/session-1`, { + id: `rt-1`, + provider: `other`, + model: `other-realtime`, + }) + ).rejects.toMatchObject({ + status: 400, + message: `Realtime provider "other" is not supported; expected "openai"`, + }) + }) }) describe(`ElectricAgentsManager attachments`, () => { diff --git a/packages/agents/src/agents/horton.ts b/packages/agents/src/agents/horton.ts index 9033a6b3ee..e556921b01 100644 --- a/packages/agents/src/agents/horton.ts +++ b/packages/agents/src/agents/horton.ts @@ -710,16 +710,18 @@ function createAssistantHandler(options: { `Horton realtime currently supports provider "openai", got "${activeRealtimeSession.provider}"` ) } - const apiKey = process.env.OPENAI_API_KEY - if (!apiKey) { - throw new Error( - `OPENAI_API_KEY must be set before starting Horton realtime mode` - ) - } const realtime = ctx.useRealtime({ systemPrompt: hortonRealtimeSystemPrompt(systemPrompt), provider: createOpenAIRealtimeProvider({ - apiKey, + apiKey: () => { + const apiKey = process.env.OPENAI_API_KEY + if (!apiKey) { + throw new Error( + `OPENAI_API_KEY must be set before starting Horton realtime mode` + ) + } + return apiKey + }, model: activeRealtimeSession.model, }), tools: tools as AgentTool[], From f980ea1a69a1795b7f748ec49022de4bb17880d0 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 13:04:17 +0100 Subject: [PATCH 11/31] fix(agents): make realtime voice input activate reliably --- packages/agents-runtime/src/openai-realtime.ts | 16 +++++++++++++++- .../agents-runtime/test/openai-realtime.test.ts | 12 +++++++++++- .../agents-server-ui/src/lib/realtime-audio.ts | 5 +++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/packages/agents-runtime/src/openai-realtime.ts b/packages/agents-runtime/src/openai-realtime.ts index 72fdb5ea46..026a5294d1 100644 --- a/packages/agents-runtime/src/openai-realtime.ts +++ b/packages/agents-runtime/src/openai-realtime.ts @@ -245,7 +245,21 @@ function buildSessionUpdate( ...(inputFormat || outputFormat || opts.voice ? { audio: { - ...(inputFormat ? { input: { format: inputFormat } } : {}), + ...(inputFormat + ? { + input: { + format: inputFormat, + turn_detection: { + type: `server_vad`, + threshold: 0.5, + prefix_padding_ms: 300, + silence_duration_ms: 200, + create_response: true, + interrupt_response: true, + }, + }, + } + : {}), ...(outputFormat || opts.voice ? { output: { diff --git a/packages/agents-runtime/test/openai-realtime.test.ts b/packages/agents-runtime/test/openai-realtime.test.ts index 9fc88444c3..ea981e9c43 100644 --- a/packages/agents-runtime/test/openai-realtime.test.ts +++ b/packages/agents-runtime/test/openai-realtime.test.ts @@ -100,7 +100,17 @@ describe(`createOpenAIRealtimeProvider`, () => { }, ], audio: { - input: { format: { type: `audio/pcm`, rate: 24_000 } }, + input: { + format: { type: `audio/pcm`, rate: 24_000 }, + turn_detection: { + type: `server_vad`, + threshold: 0.5, + prefix_padding_ms: 300, + silence_duration_ms: 200, + create_response: true, + interrupt_response: true, + }, + }, output: { format: { type: `audio/pcm`, rate: 24_000 } }, }, }, diff --git a/packages/agents-server-ui/src/lib/realtime-audio.ts b/packages/agents-server-ui/src/lib/realtime-audio.ts index 8d110e8634..f5c14aefc0 100644 --- a/packages/agents-server-ui/src/lib/realtime-audio.ts +++ b/packages/agents-server-ui/src/lib/realtime-audio.ts @@ -124,6 +124,10 @@ export async function startRealtimeAudioSession({ const abort = new AbortController() const micContext = createAudioContext() const playbackContext = createAudioContext() + const resumeAudioContexts = Promise.allSettled([ + micContext.resume(), + playbackContext.resume(), + ]) let appendQueue = Promise.resolve() let playback = Promise.resolve() let control = Promise.resolve() @@ -209,6 +213,7 @@ export async function startRealtimeAudioSession({ autoGainControl: true, }, }) + await resumeAudioContexts session = await createRealtimeSession(baseUrl, entityUrl) const audioIn = streamHandle( baseUrl, From b5fe6c3b01fc7d185024a8c1b86e91c97a664718 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 13:15:31 +0100 Subject: [PATCH 12/31] fix(agents): avoid inactive realtime response cancel --- .../agents-runtime/src/context-factory.ts | 7 +++++ .../test/realtime-context.test.ts | 27 +++++++++++++++++++ .../src/lib/realtime-audio.ts | 21 +++++++-------- 3 files changed, 43 insertions(+), 12 deletions(-) diff --git a/packages/agents-runtime/src/context-factory.ts b/packages/agents-runtime/src/context-factory.ts index 622b15ee45..183e9ba8df 100644 --- a/packages/agents-runtime/src/context-factory.ts +++ b/packages/agents-runtime/src/context-factory.ts @@ -1282,6 +1282,13 @@ export function createHandlerContext( break case `session.error`: + if (event.code === `response_cancel_not_active`) { + runtimeLog.warn( + `[agent-runtime]`, + `realtime provider ignored inactive response cancellation: ${event.error}` + ) + break + } throw new Error( `[agent-runtime] realtime provider error${event.code ? ` ${event.code}` : ``}: ${event.error}` ) diff --git a/packages/agents-runtime/test/realtime-context.test.ts b/packages/agents-runtime/test/realtime-context.test.ts index 7a4c129ccb..a07f37f339 100644 --- a/packages/agents-runtime/test/realtime-context.test.ts +++ b/packages/agents-runtime/test/realtime-context.test.ts @@ -182,6 +182,33 @@ describe(`ctx.useRealtime()`, () => { }) }) + it(`does not fail the run when OpenAI reports inactive response cancellation`, async () => { + const { ctx } = createTestHandlerContext() + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ + events: [ + { type: `session.started` }, + { + type: `session.error`, + code: `response_cancel_not_active`, + error: `Cancellation failed: no active response found`, + }, + { type: `session.closed` }, + ], + }), + tools: [], + }) + + await expect(realtime.run()).resolves.toMatchObject({ + usage: { tokens: 0 }, + }) + expect(ctx.db.collections.runs.toArray).toMatchObject([ + { status: `completed`, finish_reason: `stop` }, + ]) + }) + it(`persists provider audio and control output to realtime durable streams`, async () => { const { ctx } = createTestHandlerContext({ realtimeStreams: { diff --git a/packages/agents-server-ui/src/lib/realtime-audio.ts b/packages/agents-server-ui/src/lib/realtime-audio.ts index f5c14aefc0..4f795b916b 100644 --- a/packages/agents-server-ui/src/lib/realtime-audio.ts +++ b/packages/agents-server-ui/src/lib/realtime-audio.ts @@ -160,6 +160,9 @@ export async function startRealtimeAudioSession({ } const interruptPlayback = (): void => { + const itemId = currentOutputItemId + if (!itemId) return + const audioEndMs = currentOutputStartedAt === null ? 0 @@ -169,20 +172,14 @@ export async function startRealtimeAudioSession({ (playbackContext.currentTime - currentOutputStartedAt) * 1000 ) ) - const itemId = currentOutputItemId stopScheduledPlayback() - void appendControl({ type: `response.cancel` }).catch((error) => { - console.warn(`[realtime-audio] response cancel failed`, error) + void appendControl({ + type: `output_audio.truncate`, + itemId, + audioEndMs, + }).catch((error) => { + console.warn(`[realtime-audio] output truncate failed`, error) }) - if (itemId) { - void appendControl({ - type: `output_audio.truncate`, - itemId, - audioEndMs, - }).catch((error) => { - console.warn(`[realtime-audio] output truncate failed`, error) - }) - } } const cleanup = async (sendClose: boolean): Promise => { From 45ea73b441f2173009a4bc20f4ef0c6473f260af Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 13:24:05 +0100 Subject: [PATCH 13/31] fix(agents): use supported OpenAI realtime model --- .../agents-runtime/src/openai-realtime.ts | 90 ++++++++++++++++--- .../test/electric-agents-client.test.ts | 6 +- .../test/openai-realtime.test.ts | 30 ++++++- .../test/realtime-context.test.ts | 10 +-- ...time-server-client-update-metadata.test.ts | 8 +- .../src/lib/realtime-audio.ts | 2 +- ...ic-agents-manager-write-validation.test.ts | 6 +- .../test/horton-tool-composition.test.ts | 4 +- 8 files changed, 123 insertions(+), 33 deletions(-) diff --git a/packages/agents-runtime/src/openai-realtime.ts b/packages/agents-runtime/src/openai-realtime.ts index 026a5294d1..f3b36676ae 100644 --- a/packages/agents-runtime/src/openai-realtime.ts +++ b/packages/agents-runtime/src/openai-realtime.ts @@ -30,6 +30,8 @@ type OpenAIRealtimeWebSocketConstructor = new ( init?: unknown ) => OpenAIRealtimeSocket +const DEFAULT_OPENAI_REALTIME_MODEL = `gpt-realtime` + export interface OpenAIRealtimeProviderOptions { apiKey: string | (() => MaybePromise) model?: string @@ -44,15 +46,18 @@ type OpenAIRealtimeEvent = Record & { type?: string } class AsyncEventQueue implements AsyncIterable { private values: Array = [] - private resolvers: Array<(value: IteratorResult) => void> = [] + private resolvers: Array<{ + resolve: (value: IteratorResult) => void + reject: (error: unknown) => void + }> = [] private closed = false private error: unknown push(value: T): void { if (this.closed) return - const resolve = this.resolvers.shift() - if (resolve) { - resolve({ value, done: false }) + const resolver = this.resolvers.shift() + if (resolver) { + resolver.resolve({ value, done: false }) return } this.values.push(value) @@ -61,14 +66,18 @@ class AsyncEventQueue implements AsyncIterable { close(): void { if (this.closed) return this.closed = true - for (const resolve of this.resolvers.splice(0)) { - resolve({ value: undefined as T, done: true }) + for (const resolver of this.resolvers.splice(0)) { + resolver.resolve({ value: undefined as T, done: true }) } } fail(error: unknown): void { + if (this.closed) return this.error = error - this.close() + this.closed = true + for (const resolver of this.resolvers.splice(0)) { + resolver.reject(error) + } } [Symbol.asyncIterator](): AsyncIterator { @@ -83,8 +92,8 @@ class AsyncEventQueue implements AsyncIterable { if (this.closed) { return Promise.resolve({ value: undefined as T, done: true }) } - return new Promise>((resolve) => { - this.resolvers.push(resolve) + return new Promise>((resolve, reject) => { + this.resolvers.push({ resolve, reject }) }) }, } @@ -123,6 +132,48 @@ function socketMessageData(args: Array): unknown { return first } +function socketCloseDetails(args: Array): { + code?: number + reason?: string + wasClean?: boolean +} { + const [first, second] = args + if (typeof first === `number`) { + return { + code: first, + reason: second === undefined ? undefined : dataToString(second), + } + } + if (!first || typeof first !== `object`) return {} + const event = first as { + code?: unknown + reason?: unknown + wasClean?: unknown + } + return { + code: typeof event.code === `number` ? event.code : undefined, + reason: + typeof event.reason === `string` + ? event.reason + : event.reason === undefined + ? undefined + : dataToString(event.reason), + wasClean: typeof event.wasClean === `boolean` ? event.wasClean : undefined, + } +} + +function socketCloseError(details: { + code?: number + reason?: string + wasClean?: boolean +}): string { + const parts = [`OpenAI realtime WebSocket closed before client stop`] + if (details.code !== undefined) parts.push(`code=${details.code}`) + if (details.reason) parts.push(`reason=${details.reason}`) + if (details.wasClean !== undefined) parts.push(`clean=${details.wasClean}`) + return parts.join(` `) +} + function dataToString(data: unknown): string { if (typeof data === `string`) return data if (data instanceof ArrayBuffer) return new TextDecoder().decode(data) @@ -235,7 +286,7 @@ function buildSessionUpdate( type: `session.update`, session: { type: `realtime`, - model: opts.model ?? `gpt-realtime-2`, + model: opts.model ?? DEFAULT_OPENAI_REALTIME_MODEL, instructions: input.systemPrompt, output_modalities: outputFormat ? [`audio`] : [`text`], tool_choice: input.tools.length > 0 ? `auto` : `none`, @@ -456,7 +507,7 @@ function mapOpenAIEvent( export function createOpenAIRealtimeProvider( opts: OpenAIRealtimeProviderOptions ): RealtimeProviderConfig { - const model = opts.model ?? `gpt-realtime-2` + const model = opts.model ?? DEFAULT_OPENAI_REALTIME_MODEL return { id: `openai`, @@ -486,6 +537,7 @@ export function createOpenAIRealtimeProvider( ) let socketOpen = false let socketClosed = false + let clientCloseRequested = false let rejectOpen: ((error: Error) => void) | undefined const closeQueue = (reason?: string): void => { @@ -500,6 +552,7 @@ export function createOpenAIRealtimeProvider( const error = new Error( `[agent-runtime] OpenAI realtime WebSocket aborted` ) + clientCloseRequested = true closeQueue(`aborted`) ws.close?.(1000, `aborted`) if (!socketOpen) rejectOpen?.(error) @@ -612,8 +665,18 @@ export function createOpenAIRealtimeProvider( queue.fail(error) } }) - onSocket(ws, `close`, () => { - closeQueue() + onSocket(ws, `close`, (...args) => { + const details = socketCloseDetails(args) + if (clientCloseRequested || input.signal?.aborted) { + closeQueue(details.reason || undefined) + return + } + queue.push({ + type: `session.error`, + code: `websocket_closed`, + error: socketCloseError(details), + }) + closeQueue(details.reason || `websocket_closed`) }) if (input.signal?.aborted) { @@ -664,6 +727,7 @@ export function createOpenAIRealtimeProvider( }) }, close: async (reason) => { + clientCloseRequested = true closeQueue(reason) ws.close?.(1000, reason) }, diff --git a/packages/agents-runtime/test/electric-agents-client.test.ts b/packages/agents-runtime/test/electric-agents-client.test.ts index 309d7512df..b493cc3aa7 100644 --- a/packages/agents-runtime/test/electric-agents-client.test.ts +++ b/packages/agents-runtime/test/electric-agents-client.test.ts @@ -55,7 +55,7 @@ describe(`createAgentsClient`, () => { sessionId: `rt-1`, entityUrl: `/horton/demo`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, status: `requested`, startedAt: `2026-06-09T10:00:00.000Z`, streams: { @@ -176,7 +176,7 @@ describe(`createAgentsClient`, () => { client.startRealtimeSession({ entityUrl: `/horton/demo`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, }) ).resolves.toMatchObject({ sessionId: `rt-1`, @@ -188,7 +188,7 @@ describe(`createAgentsClient`, () => { expect(mockState.startRealtimeSession).toHaveBeenCalledWith({ entityUrl: `/horton/demo`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, }) }) diff --git a/packages/agents-runtime/test/openai-realtime.test.ts b/packages/agents-runtime/test/openai-realtime.test.ts index ea981e9c43..5918853c54 100644 --- a/packages/agents-runtime/test/openai-realtime.test.ts +++ b/packages/agents-runtime/test/openai-realtime.test.ts @@ -76,7 +76,7 @@ describe(`createOpenAIRealtimeProvider`, () => { const socket = FakeWebSocket.instances[0]! expect(socket.url).toBe( - `wss://api.openai.com/v1/realtime?model=gpt-realtime-2` + `wss://api.openai.com/v1/realtime?model=gpt-realtime` ) expect(socket.init).toEqual({ headers: { @@ -88,7 +88,7 @@ describe(`createOpenAIRealtimeProvider`, () => { type: `session.update`, session: { type: `realtime`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, instructions: `You are Horton.`, output_modalities: [`audio`], tool_choice: `auto`, @@ -174,6 +174,32 @@ describe(`createOpenAIRealtimeProvider`, () => { }) }) + it(`surfaces unexpected WebSocket closes as provider errors`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + const socket = FakeWebSocket.instances[0]! + const iterator = session.events[Symbol.asyncIterator]() + + socket.emit(`close`, { code: 1008, reason: `invalid model` }) + + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `session.error`, + code: `websocket_closed`, + error: + `OpenAI realtime WebSocket closed before client stop ` + + `code=1008 reason=invalid model`, + }) + }) + it(`can truncate output audio for interrupted playback`, async () => { FakeWebSocket.instances = [] const provider = createOpenAIRealtimeProvider({ diff --git a/packages/agents-runtime/test/realtime-context.test.ts b/packages/agents-runtime/test/realtime-context.test.ts index a07f37f339..2c488c2dae 100644 --- a/packages/agents-runtime/test/realtime-context.test.ts +++ b/packages/agents-runtime/test/realtime-context.test.ts @@ -73,7 +73,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, status: `active`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, @@ -100,7 +100,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, status: `requested`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, @@ -146,7 +146,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, status: `requested`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, @@ -163,7 +163,7 @@ describe(`ctx.useRealtime()`, () => { systemPrompt: `You are realtime.`, provider: { id: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, connect: async () => { throw new Error(`missing key`) }, @@ -221,7 +221,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, status: `active`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, diff --git a/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts b/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts index c88a9ebad4..3290f6b91a 100644 --- a/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts +++ b/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts @@ -143,7 +143,7 @@ describe(`runtime-server-client realtime sessions`, () => { sessionId: `rt-1`, entityUrl: `/horton/demo`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, status: `requested`, startedAt: `2026-06-09T10:00:00.000Z`, streams: { @@ -171,7 +171,7 @@ describe(`runtime-server-client realtime sessions`, () => { entityUrl: `/horton/demo`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, inputAudio: { codec: `pcm16`, sampleRate: 16_000, channels: 1 }, meta: { source: `button` }, }) @@ -189,7 +189,7 @@ describe(`runtime-server-client realtime sessions`, () => { entityUrl: `/horton/demo`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, inputAudio: { codec: `pcm16`, sampleRate: 16_000, channels: 1 }, meta: { source: `button` }, }) @@ -208,7 +208,7 @@ describe(`runtime-server-client realtime sessions`, () => { client.startRealtimeSession({ entityUrl: `/horton/demo`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, }) ).rejects.toThrow(/startRealtimeSession.*401.*not allowed/) }) diff --git a/packages/agents-server-ui/src/lib/realtime-audio.ts b/packages/agents-server-ui/src/lib/realtime-audio.ts index 4f795b916b..bb87b08a33 100644 --- a/packages/agents-server-ui/src/lib/realtime-audio.ts +++ b/packages/agents-server-ui/src/lib/realtime-audio.ts @@ -92,7 +92,7 @@ async function createRealtimeSession( body: JSON.stringify({ entityUrl, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, inputAudio: { codec: `pcm16`, sampleRate: REALTIME_SAMPLE_RATE, diff --git a/packages/agents-server/test/electric-agents-manager-write-validation.test.ts b/packages/agents-server/test/electric-agents-manager-write-validation.test.ts index c4922ec140..c4a5116f00 100644 --- a/packages/agents-server/test/electric-agents-manager-write-validation.test.ts +++ b/packages/agents-server/test/electric-agents-manager-write-validation.test.ts @@ -158,7 +158,7 @@ describe(`ElectricAgentsManager realtime sessions`, () => { const result = await manager.createRealtimeSession(`/chat/session-1`, { id: `rt-1`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, inputAudio: { codec: `pcm16`, sampleRate: 16_000, channels: 1 }, outputAudio: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, meta: { source: `test` }, @@ -203,7 +203,7 @@ describe(`ElectricAgentsManager realtime sessions`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, status: `requested`, streams: result.streams, retention: `forever`, @@ -216,7 +216,7 @@ describe(`ElectricAgentsManager realtime sessions`, () => { value: { session_id: `rt-1`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, status: `requested`, streams: result.streams, }, diff --git a/packages/agents/test/horton-tool-composition.test.ts b/packages/agents/test/horton-tool-composition.test.ts index 8c05e003b4..f2caa8db41 100644 --- a/packages/agents/test/horton-tool-composition.test.ts +++ b/packages/agents/test/horton-tool-composition.test.ts @@ -167,7 +167,7 @@ describe(`horton tool composition`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, status: `active`, startedAt: `2026-06-09T12:00:00.000Z`, retention: `forever`, @@ -208,7 +208,7 @@ describe(`horton tool composition`, () => { )[0]![0] expect(realtimeConfig.provider).toMatchObject({ id: `openai`, - model: `gpt-realtime-2`, + model: `gpt-realtime`, }) expect(realtimeConfig.toolPolicy.direct).toEqual( expect.arrayContaining([ From 9ef1763f554cbf26520dce3a3d4b39176c1db871 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 13:46:17 +0100 Subject: [PATCH 14/31] fix(agents): wire realtime audio path --- .../agents-runtime/src/context-factory.ts | 46 +++++++++++++++++ .../agents-runtime/src/openai-realtime.ts | 4 ++ .../agents-runtime/src/timeline-context.ts | 16 ++++++ .../test/openai-realtime.test.ts | 51 +++++++++++++++++++ .../src/components/EntityTimeline.tsx | 20 +++++++- .../src/lib/realtime-audio.ts | 24 +++++++++ packages/agents-server/src/entity-manager.ts | 39 ++++++++++---- ...ic-agents-manager-write-validation.test.ts | 30 +++++++---- 8 files changed, 209 insertions(+), 21 deletions(-) diff --git a/packages/agents-runtime/src/context-factory.ts b/packages/agents-runtime/src/context-factory.ts index 183e9ba8df..a2f3f923af 100644 --- a/packages/agents-runtime/src/context-factory.ts +++ b/packages/agents-runtime/src/context-factory.ts @@ -172,6 +172,7 @@ function createRealtimeStreamIo( ): RealtimeStreamIo | undefined { if (!config.realtimeStreams || !session) return undefined + const logPrefix = `[agent-runtime]` const abort = new AbortController() const abortFromRun = (): void => abort.abort() if (config.runSignal?.aborted) { @@ -201,6 +202,17 @@ function createRealtimeStreamIo( `application/json` ) const tasks: Array> = [] + let audioInChunks = 0 + let audioInBytes = 0 + let controlInCommands = 0 + let audioOutChunks = 0 + let audioOutBytes = 0 + let controlOutEvents = 0 + + runtimeLog.info( + logPrefix, + `realtime stream bridge starting session=${session.id} audioIn=${session.streams.audio_in} audioOut=${session.streams.audio_out}` + ) if (providerSession.appendInputAudio) { tasks.push( @@ -213,6 +225,14 @@ function createRealtimeStreamIo( try { for await (const chunk of response.bodyStream()) { if (abort.signal.aborted) break + audioInChunks += 1 + audioInBytes += chunk.byteLength + if (audioInChunks === 1) { + runtimeLog.info( + logPrefix, + `realtime audio/in first chunk session=${session.id} bytes=${chunk.byteLength}` + ) + } await providerSession.appendInputAudio?.(chunk) } } finally { @@ -242,6 +262,13 @@ function createRealtimeStreamIo( if (abort.signal.aborted || !isRealtimeControlInput(command)) { continue } + controlInCommands += 1 + if (controlInCommands === 1) { + runtimeLog.info( + logPrefix, + `realtime control/in first command session=${session.id} type=${command.type}` + ) + } switch (command.type) { case `input_text`: await providerSession.sendText?.(command.text) @@ -279,7 +306,22 @@ function createRealtimeStreamIo( return { async writeProviderEvent(event) { + controlOutEvents += 1 + if (controlOutEvents === 1) { + runtimeLog.info( + logPrefix, + `realtime provider first event session=${session.id} type=${event.type}` + ) + } if (event.type === `output_audio.delta`) { + audioOutChunks += 1 + audioOutBytes += event.audio.byteLength + if (audioOutChunks === 1) { + runtimeLog.info( + logPrefix, + `realtime audio/out first chunk session=${session.id} bytes=${event.audio.byteLength}` + ) + } await audioOut.append(event.audio) } await controlOut.append(jsonBytes(realtimeControlOutput(event))) @@ -288,6 +330,10 @@ function createRealtimeStreamIo( abort.abort() config.runSignal?.removeEventListener(`abort`, abortFromRun) await Promise.allSettled(tasks) + runtimeLog.info( + logPrefix, + `realtime stream bridge closed session=${session.id} audioInChunks=${audioInChunks} audioInBytes=${audioInBytes} controlInCommands=${controlInCommands} providerEvents=${controlOutEvents} audioOutChunks=${audioOutChunks} audioOutBytes=${audioOutBytes}` + ) }, } } diff --git a/packages/agents-runtime/src/openai-realtime.ts b/packages/agents-runtime/src/openai-realtime.ts index f3b36676ae..7914c95cc3 100644 --- a/packages/agents-runtime/src/openai-realtime.ts +++ b/packages/agents-runtime/src/openai-realtime.ts @@ -409,6 +409,7 @@ function mapOpenAIEvent( }, ] case `response.audio.delta`: + case `response.output_audio.delta`: return [ { type: `output_audio.delta`, @@ -421,6 +422,7 @@ function mapOpenAIEvent( }, ] case `response.audio.done`: + case `response.output_audio.done`: return [ { type: `output_audio.completed`, @@ -432,6 +434,7 @@ function mapOpenAIEvent( }, ] case `response.audio_transcript.delta`: + case `response.output_audio_transcript.delta`: case `response.output_text.delta`: return [ { @@ -444,6 +447,7 @@ function mapOpenAIEvent( }, ] case `response.audio_transcript.done`: + case `response.output_audio_transcript.done`: case `response.output_text.done`: return [ { diff --git a/packages/agents-runtime/src/timeline-context.ts b/packages/agents-runtime/src/timeline-context.ts index 461430da4a..a416e14d32 100644 --- a/packages/agents-runtime/src/timeline-context.ts +++ b/packages/agents-runtime/src/timeline-context.ts @@ -194,6 +194,21 @@ function renderSignalMessage(signal: Signal): LLMMessage { } } +function isRealtimeSessionWake(payload: unknown): boolean { + if (!payload || typeof payload !== `object`) return false + const changes = (payload as { changes?: unknown }).changes + if (!Array.isArray(changes)) return false + return changes.some((change) => { + if (!change || typeof change !== `object`) return false + const payload = (change as { payload?: unknown }).payload + return ( + !!payload && + typeof payload === `object` && + (payload as { type?: unknown }).type === `realtime_session.started` + ) + }) +} + export function defaultProjection( item: TimelineItem ): Array | null { @@ -202,6 +217,7 @@ export function defaultProjection( return [{ role: `user`, content: projectInboxPayload(item) }] case `wake`: + if (isRealtimeSessionWake(item.payload)) return null return [{ role: `user`, content: asString(item.payload) }] case `signal`: diff --git a/packages/agents-runtime/test/openai-realtime.test.ts b/packages/agents-runtime/test/openai-realtime.test.ts index 5918853c54..9de8bf4f07 100644 --- a/packages/agents-runtime/test/openai-realtime.test.ts +++ b/packages/agents-runtime/test/openai-realtime.test.ts @@ -227,6 +227,57 @@ describe(`createOpenAIRealtimeProvider`, () => { }) }) + it(`maps GA output audio and transcript events`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + const socket = FakeWebSocket.instances[0]! + const iterator = session.events[Symbol.asyncIterator]() + + socket.emitMessage({ + type: `response.output_audio.delta`, + response_id: `resp-1`, + item_id: `item-1`, + delta: `AQID`, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `output_audio.delta`, + responseId: `resp-1`, + itemId: `item-1`, + audio: new Uint8Array([1, 2, 3]), + }) + + socket.emitMessage({ + type: `response.output_audio_transcript.delta`, + response_id: `resp-1`, + delta: `hello`, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `output_transcript.delta`, + responseId: `resp-1`, + delta: `hello`, + }) + + socket.emitMessage({ + type: `response.output_audio.done`, + response_id: `resp-1`, + item_id: `item-1`, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `output_audio.completed`, + responseId: `resp-1`, + itemId: `item-1`, + }) + }) + it(`maps OpenAI events and executes function calls`, async () => { FakeWebSocket.instances = [] const execute = vi.fn().mockResolvedValue({ diff --git a/packages/agents-server-ui/src/components/EntityTimeline.tsx b/packages/agents-server-ui/src/components/EntityTimeline.tsx index 214b806066..fe79c0dd52 100644 --- a/packages/agents-server-ui/src/components/EntityTimeline.tsx +++ b/packages/agents-server-ui/src/components/EntityTimeline.tsx @@ -101,6 +101,20 @@ function readInboxPayloadDisplay(payload: unknown): string { return stringifyPayload(payload, 2) } +function isRealtimeSessionWake(row: RenderTimelineRow): boolean { + const changes = row.wake?.payload.changes + if (!Array.isArray(changes)) return false + return changes.some((change) => { + if (!change || typeof change !== `object`) return false + const payload = (change as { payload?: unknown }).payload + return ( + !!payload && + typeof payload === `object` && + (payload as { type?: unknown }).type === `realtime_session.started` + ) + }) +} + function stringifySearchPayload(value: unknown): string { if (value == null) return `` if (typeof value === `string`) return value @@ -1144,7 +1158,11 @@ export function EntityTimeline({ const previousStreamingAgentKeyRef = useRef(null) const textColumnWidth = Math.max(0, contentWidth - CHAT_SURFACE_GUTTER) const displayRows = useMemo( - () => rows.filter((row) => !isAttachmentManifest(row.manifest)), + () => + rows.filter( + (row) => + !isAttachmentManifest(row.manifest) && !isRealtimeSessionWake(row) + ), [rows] ) const attachmentsByInboxKey = useMemo(() => { diff --git a/packages/agents-server-ui/src/lib/realtime-audio.ts b/packages/agents-server-ui/src/lib/realtime-audio.ts index bb87b08a33..ffbdb4552f 100644 --- a/packages/agents-server-ui/src/lib/realtime-audio.ts +++ b/packages/agents-server-ui/src/lib/realtime-audio.ts @@ -140,6 +140,9 @@ export async function startRealtimeAudioSession({ let nextPlaybackTime = playbackContext.currentTime let currentOutputItemId: string | null = null let currentOutputStartedAt: number | null = null + let micChunks = 0 + let playbackChunks = 0 + let controlEvents = 0 const playbackNodes = new Set() const appendControl = async (value: unknown): Promise => { @@ -212,6 +215,9 @@ export async function startRealtimeAudioSession({ }) await resumeAudioContexts session = await createRealtimeSession(baseUrl, entityUrl) + console.info( + `[realtime-audio] session started session=${session.sessionId} audioIn=${session.streams.audio_in} audioOut=${session.streams.audio_out}` + ) const audioIn = streamHandle( baseUrl, session.streams.audio_in, @@ -241,6 +247,12 @@ export async function startRealtimeAudioSession({ if (abort.signal.aborted) return const input = event.inputBuffer.getChannelData(0) const bytes = pcm16Bytes(input) + micChunks += 1 + if (micChunks === 1) { + console.info( + `[realtime-audio] microphone first chunk session=${session?.sessionId} bytes=${bytes.byteLength}` + ) + } appendQueue = appendQueue .then(() => audioIn.append(bytes)) .catch((error) => { @@ -260,6 +272,12 @@ export async function startRealtimeAudioSession({ try { for await (const chunk of response.bodyStream()) { if (abort.signal.aborted || chunk.byteLength === 0) continue + playbackChunks += 1 + if (playbackChunks === 1) { + console.info( + `[realtime-audio] playback first chunk session=${session?.sessionId} bytes=${chunk.byteLength}` + ) + } const samples = pcm16Floats(chunk) const buffer = playbackContext.createBuffer( 1, @@ -305,6 +323,12 @@ export async function startRealtimeAudioSession({ if (abort.signal.aborted || !event || typeof event !== `object`) { continue } + controlEvents += 1 + if (controlEvents === 1) { + console.info( + `[realtime-audio] control first event session=${session?.sessionId} type=${event.type}` + ) + } if ( event.type === `output_audio.delta` && typeof event.itemId === `string` diff --git a/packages/agents-server/src/entity-manager.ts b/packages/agents-server/src/entity-manager.ts index b6db919629..7e8b5d3bc6 100644 --- a/packages/agents-server/src/entity-manager.ts +++ b/packages/agents-server/src/entity-manager.ts @@ -2637,22 +2637,41 @@ export class EntityManager { streams, ...(req.meta ? { meta: req.meta } : {}), }, - } as never) + } as any) await this.streamClient.append( entity.streams.main, this.encodeChangeEvent(sessionEvent as Record) ) - await this.send(entityUrl, { - from: SERVER_SIGNAL_SENDER, - payload: { - type: `realtime_session.started`, - sessionId, - provider, - model, - streams, + const wakeEvent = entityStateSchema.wakes.insert({ + key: `wake-realtime-session-${sessionId}`, + value: { + timestamp: startedAt, + source: entityUrl, + timeout: false, + changes: [ + { + collection: `realtimeSessions`, + kind: `insert`, + key: manifestKey, + from: SERVER_SIGNAL_SENDER, + payload: { + type: `realtime_session.started`, + sessionId, + provider, + model, + streams, + }, + timestamp: startedAt, + message_type: `realtime_session.started`, + }, + ], }, - }) + } as any) + await this.streamClient.append( + entity.streams.main, + this.encodeChangeEvent(wakeEvent as Record) + ) } catch (error) { await Promise.allSettled( createdStreams.map((path) => this.streamClient.delete(path)) diff --git a/packages/agents-server/test/electric-agents-manager-write-validation.test.ts b/packages/agents-server/test/electric-agents-manager-write-validation.test.ts index c4a5116f00..e8d309db32 100644 --- a/packages/agents-server/test/electric-agents-manager-write-validation.test.ts +++ b/packages/agents-server/test/electric-agents-manager-write-validation.test.ts @@ -193,7 +193,7 @@ describe(`ElectricAgentsManager realtime sessions`, () => { expect(append).toHaveBeenCalledTimes(3) const manifestEvent = decodeAppend(append.mock.calls[0]!) const sessionEvent = decodeAppend(append.mock.calls[1]!) - const inboxEvent = decodeAppend(append.mock.calls[2]!) + const wakeEvent = decodeAppend(append.mock.calls[2]!) expect(manifestEvent).toMatchObject({ type: `manifest`, @@ -221,18 +221,28 @@ describe(`ElectricAgentsManager realtime sessions`, () => { streams: result.streams, }, }) - expect(inboxEvent).toMatchObject({ - type: `inbox`, + expect(wakeEvent).toMatchObject({ + type: `wake`, + key: `wake-realtime-session-rt-1`, value: { - from: `/_electric/server`, - payload: { - type: `realtime_session.started`, - sessionId: `rt-1`, - streams: result.streams, - }, + source: `/chat/session-1`, + timeout: false, + changes: [ + { + collection: `realtimeSessions`, + kind: `insert`, + key: `realtime-session:rt-1`, + from: `/_electric/server`, + payload: { + type: `realtime_session.started`, + sessionId: `rt-1`, + streams: result.streams, + }, + message_type: `realtime_session.started`, + }, + ], }, }) - expect(inboxEvent.value).not.toHaveProperty(`message_type`) }) it(`rejects non-OpenAI realtime providers in V1`, async () => { From 1c64549d7afd19d6c99e1b92cf77cc5148453ad6 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 13:55:35 +0100 Subject: [PATCH 15/31] fix(agents): clamp realtime audio truncation --- .../agents-runtime/src/context-factory.ts | 11 ++++ .../test/realtime-context.test.ts | 27 ++++++++++ .../src/components/MessageInput.module.css | 32 ++++++++++++ .../src/components/MessageInput.tsx | 33 +++++++++++- .../src/lib/realtime-audio.ts | 50 +++++++++++++++++-- 5 files changed, 148 insertions(+), 5 deletions(-) diff --git a/packages/agents-runtime/src/context-factory.ts b/packages/agents-runtime/src/context-factory.ts index a2f3f923af..7fbec9f533 100644 --- a/packages/agents-runtime/src/context-factory.ts +++ b/packages/agents-runtime/src/context-factory.ts @@ -1335,6 +1335,17 @@ export function createHandlerContext( ) break } + if ( + event.code === `invalid_value` && + event.error.includes(`Audio content`) && + event.error.includes(`already shorter than`) + ) { + runtimeLog.warn( + `[agent-runtime]`, + `realtime provider ignored stale output audio truncate: ${event.error}` + ) + break + } throw new Error( `[agent-runtime] realtime provider error${event.code ? ` ${event.code}` : ``}: ${event.error}` ) diff --git a/packages/agents-runtime/test/realtime-context.test.ts b/packages/agents-runtime/test/realtime-context.test.ts index 2c488c2dae..0c7109d46c 100644 --- a/packages/agents-runtime/test/realtime-context.test.ts +++ b/packages/agents-runtime/test/realtime-context.test.ts @@ -209,6 +209,33 @@ describe(`ctx.useRealtime()`, () => { ]) }) + it(`does not fail the run when OpenAI reports a stale output audio truncate`, async () => { + const { ctx } = createTestHandlerContext() + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ + events: [ + { type: `session.started` }, + { + type: `session.error`, + code: `invalid_value`, + error: `Audio content of 6350ms is already shorter than 8160ms`, + }, + { type: `session.closed` }, + ], + }), + tools: [], + }) + + await expect(realtime.run()).resolves.toMatchObject({ + usage: { tokens: 0 }, + }) + expect(ctx.db.collections.runs.toArray).toMatchObject([ + { status: `completed`, finish_reason: `stop` }, + ]) + }) + it(`persists provider audio and control output to realtime durable streams`, async () => { const { ctx } = createTestHandlerContext({ realtimeStreams: { diff --git a/packages/agents-server-ui/src/components/MessageInput.module.css b/packages/agents-server-ui/src/components/MessageInput.module.css index 5c2fbccf24..5ced0808eb 100644 --- a/packages/agents-server-ui/src/components/MessageInput.module.css +++ b/packages/agents-server-ui/src/components/MessageInput.module.css @@ -68,6 +68,38 @@ color: var(--ds-accent-11); } +.voiceMeter { + display: inline-flex; + align-items: center; + justify-content: center; + gap: 2px; + width: 0; + height: 20px; + color: var(--ds-accent-11); + opacity: 0; + overflow: hidden; + transition: + opacity 0.12s ease, + width 0.12s ease; +} + +.voiceMeter[data-active='true'] { + width: 18px; + opacity: 1; +} + +.voiceMeterBar { + display: block; + width: 3px; + height: 14px; + border-radius: var(--ds-radius-full); + background: currentColor; + transform-origin: center bottom; + transition: + opacity 0.08s linear, + transform 0.08s linear; +} + .inlineIconButton:focus-visible { outline: 2px solid var(--ds-accent-a6); outline-offset: -2px; diff --git a/packages/agents-server-ui/src/components/MessageInput.tsx b/packages/agents-server-ui/src/components/MessageInput.tsx index 599b04d9e2..a05ca7cafa 100644 --- a/packages/agents-server-ui/src/components/MessageInput.tsx +++ b/packages/agents-server-ui/src/components/MessageInput.tsx @@ -91,6 +91,7 @@ export function MessageInput({ } | null>(null) const [realtimePending, setRealtimePending] = useState(false) const [realtimeActive, setRealtimeActive] = useState(false) + const [realtimeInputLevel, setRealtimeInputLevel] = useState(0) const realtimeSessionRef = useRef(null) const composerFocusRef = useRef<{ focus: () => void } | null>(null) const inputDisabled = disabled || writeDisabled @@ -258,19 +259,25 @@ export function MessageInput({ .catch((err: Error) => setError(err.message)) .finally(() => { setRealtimeActive(false) + setRealtimeInputLevel(0) setRealtimePending(false) }) return } if (!canUseRealtime) return setRealtimePending(true) - startRealtimeAudioSession({ baseUrl, entityUrl }) + startRealtimeAudioSession({ + baseUrl, + entityUrl, + onInputLevel: setRealtimeInputLevel, + }) .then((session) => { realtimeSessionRef.current = session setRealtimeActive(true) }) .catch((err: Error) => { setError(err.message) + setRealtimeInputLevel(0) }) .finally(() => { setRealtimePending(false) @@ -352,6 +359,12 @@ export function MessageInput({ ) const isButtonActive = canSubmit || (showStop && !stopDisabled) + const voiceLevel = realtimeActive ? realtimeInputLevel : 0 + const voiceBars = [ + Math.max(0.18, Math.min(1, 0.24 + voiceLevel * 0.76)), + Math.max(0.24, Math.min(1, 0.34 + voiceLevel * 0.9)), + Math.max(0.16, Math.min(1, 0.2 + voiceLevel * 0.82)), + ] const sendTooltip = showStop ? stopDisabled ? `Signal permission required` @@ -429,6 +442,24 @@ export function MessageInput({ + {imageAttachmentsEnabled ? ( void }): Promise { const abort = new AbortController() const micContext = createAudioContext() @@ -140,6 +159,7 @@ export async function startRealtimeAudioSession({ let nextPlaybackTime = playbackContext.currentTime let currentOutputItemId: string | null = null let currentOutputStartedAt: number | null = null + let currentOutputReceivedMs = 0 let micChunks = 0 let playbackChunks = 0 let controlEvents = 0 @@ -162,11 +182,21 @@ export async function startRealtimeAudioSession({ currentOutputStartedAt = null } + const setCurrentOutputItem = (itemId: string): void => { + if (currentOutputItemId === itemId) return + currentOutputItemId = itemId + currentOutputStartedAt = null + currentOutputReceivedMs = 0 + } + const interruptPlayback = (): void => { const itemId = currentOutputItemId - if (!itemId) return + if (!itemId) { + stopScheduledPlayback() + return + } - const audioEndMs = + const playedMs = currentOutputStartedAt === null ? 0 : Math.max( @@ -175,7 +205,14 @@ export async function startRealtimeAudioSession({ (playbackContext.currentTime - currentOutputStartedAt) * 1000 ) ) + const maxGeneratedMs = Math.max( + 0, + Math.floor(currentOutputReceivedMs - TRUNCATE_SAFETY_MS) + ) + const audioEndMs = Math.min(playedMs, maxGeneratedMs) stopScheduledPlayback() + if (audioEndMs <= 0) return + void appendControl({ type: `output_audio.truncate`, itemId, @@ -190,6 +227,7 @@ export async function startRealtimeAudioSession({ processor?.disconnect() silentOutput?.disconnect() source?.disconnect() + onInputLevel?.(0) for (const track of media?.getTracks() ?? []) track.stop() stopScheduledPlayback() await appendQueue.catch(() => undefined) @@ -247,6 +285,7 @@ export async function startRealtimeAudioSession({ if (abort.signal.aborted) return const input = event.inputBuffer.getChannelData(0) const bytes = pcm16Bytes(input) + onInputLevel?.(audioLevel(input)) micChunks += 1 if (micChunks === 1) { console.info( @@ -296,7 +335,7 @@ export async function startRealtimeAudioSession({ playbackContext.currentTime, nextPlaybackTime ) - if (currentOutputStartedAt === null) { + if (currentOutputItemId && currentOutputStartedAt === null) { currentOutputStartedAt = startAt } node.start(startAt) @@ -333,7 +372,10 @@ export async function startRealtimeAudioSession({ event.type === `output_audio.delta` && typeof event.itemId === `string` ) { - currentOutputItemId = event.itemId + setCurrentOutputItem(event.itemId) + if (typeof event.byteLength === `number`) { + currentOutputReceivedMs += pcm16DurationMs(event.byteLength) + } } else if (event.type === `input_audio.speech_started`) { interruptPlayback() } From 2f2449c6c608f7af407b951b5342640333533d42 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 14:12:56 +0100 Subject: [PATCH 16/31] feat(agents): persist realtime transcripts --- .../agents-runtime/src/context-factory.ts | 153 +++++++++++++++++- .../agents-runtime/src/entity-timeline.ts | 128 ++++++++++++++- .../agents-runtime/src/openai-realtime.ts | 17 ++ .../agents-runtime/src/timeline-context.ts | 31 ++++ packages/agents-runtime/src/types.ts | 16 ++ .../test/openai-realtime.test.ts | 71 ++++++++ .../test/realtime-context.test.ts | 65 ++++++++ .../test/timeline-context.test.ts | 37 +++++ .../src/components/EntityTimeline.tsx | 36 ++++- 9 files changed, 545 insertions(+), 9 deletions(-) diff --git a/packages/agents-runtime/src/context-factory.ts b/packages/agents-runtime/src/context-factory.ts index 7fbec9f533..2c2bac123a 100644 --- a/packages/agents-runtime/src/context-factory.ts +++ b/packages/agents-runtime/src/context-factory.ts @@ -1276,6 +1276,7 @@ export function createHandlerContext( let currentToolCall: | { toolCallId: string; name: string; args: unknown } | undefined + const realtimeSession = activeRealtimeSession() const endText = (): void => { if (!textStarted) return @@ -1292,6 +1293,125 @@ export function createHandlerContext( bridge.onTextDelta(delta) } + const transcriptTextByKey = new Map() + const transcriptCreatedAtByKey = new Map() + const transcriptFallbackIds = new Map<`input` | `output`, string>() + let transcriptFallbackCounter = 0 + let providerSessionId = realtimeSession?.id + + const currentTranscriptSessionId = (): string => + realtimeSession?.id ?? providerSessionId ?? `ephemeral` + + const transcriptKey = ( + direction: `input` | `output`, + id?: string + ): string => { + let stableId = id + if (!stableId) { + stableId = transcriptFallbackIds.get(direction) + if (!stableId) { + stableId = `fallback-${transcriptFallbackCounter}` + transcriptFallbackCounter += 1 + transcriptFallbackIds.set(direction, stableId) + } + } + return `realtime-transcript:${currentTranscriptSessionId()}:${direction}:${stableId}` + } + + const writeRealtimeTranscript = (input: { + direction: `input` | `output` + key: string + text: string + status: `partial` | `final` + turnId?: string + responseId?: string + }): void => { + const collection = config.db.collections.realtimeTranscripts + if (input.text.length === 0 && !collection.has(input.key)) return + + const existing = collection.get(input.key) as + | { created_at?: string } + | undefined + const createdAt = + transcriptCreatedAtByKey.get(input.key) ?? + existing?.created_at ?? + new Date().toISOString() + transcriptCreatedAtByKey.set(input.key, createdAt) + + const value = { + session_id: currentTranscriptSessionId(), + direction: input.direction, + text: input.text, + status: input.status, + audio_stream: input.direction, + ...(input.turnId ? { turn_id: input.turnId } : {}), + ...(input.responseId ? { response_id: input.responseId } : {}), + created_at: createdAt, + } + config.writeEvent( + (collection.has(input.key) + ? entityStateSchema.realtimeTranscripts.update({ + key: input.key, + value: value as never, + }) + : entityStateSchema.realtimeTranscripts.insert({ + key: input.key, + value: value as never, + })) as ChangeEvent + ) + } + + const appendRealtimeTranscript = (input: { + direction: `input` | `output` + delta: string + turnId?: string + responseId?: string + }): void => { + if (input.delta.length === 0) return + const key = transcriptKey( + input.direction, + input.direction === `input` ? input.turnId : input.responseId + ) + const text = `${transcriptTextByKey.get(key) ?? ``}${input.delta}` + transcriptTextByKey.set(key, text) + writeRealtimeTranscript({ + direction: input.direction, + key, + text, + status: `partial`, + turnId: input.turnId, + responseId: input.responseId, + }) + } + + const completeRealtimeTranscript = (input: { + direction: `input` | `output` + text?: string + turnId?: string + responseId?: string + }): void => { + const key = transcriptKey( + input.direction, + input.direction === `input` ? input.turnId : input.responseId + ) + const text = input.text ?? transcriptTextByKey.get(key) ?? `` + transcriptTextByKey.set(key, text) + writeRealtimeTranscript({ + direction: input.direction, + key, + text, + status: `final`, + turnId: input.turnId, + responseId: input.responseId, + }) + if ( + (input.direction === `input` && !input.turnId) || + (input.direction === `output` && !input.responseId) + ) { + transcriptFallbackIds.delete(input.direction) + } + } + const composedTools = (await composeToolsWithProviders( activeRealtimeConfig.tools ?? [] )) as Array @@ -1303,7 +1423,6 @@ export function createHandlerContext( timelineToMessages(config.db) ) let realtimeIo: RealtimeStreamIo | undefined - const realtimeSession = activeRealtimeSession() let realtimeSessionTerminalWritten = false async function handleProviderEvent( @@ -1311,17 +1430,35 @@ export function createHandlerContext( ): Promise { switch (event.type) { case `session.started`: + providerSessionId = + realtimeSession?.id ?? event.sessionId ?? providerSessionId + break + case `session.updated`: case `input_audio.speech_started`: case `input_audio.speech_stopped`: - case `input_transcript.delta`: - case `input_transcript.completed`: case `output_audio.delta`: case `output_audio.completed`: case `response.started`: case `response.cancelled`: break + case `input_transcript.delta`: + appendRealtimeTranscript({ + direction: `input`, + delta: event.delta, + turnId: event.turnId, + }) + break + + case `input_transcript.completed`: + completeRealtimeTranscript({ + direction: `input`, + text: event.text, + turnId: event.turnId, + }) + break + case `session.closed`: case `response.completed`: endText() @@ -1351,10 +1488,20 @@ export function createHandlerContext( ) case `output_transcript.delta`: + appendRealtimeTranscript({ + direction: `output`, + delta: event.delta, + responseId: event.responseId, + }) emitText(event.delta) break case `output_transcript.completed`: + completeRealtimeTranscript({ + direction: `output`, + text: event.text, + responseId: event.responseId, + }) if (!textStarted && event.text) { emitText(event.text) } diff --git a/packages/agents-runtime/src/entity-timeline.ts b/packages/agents-runtime/src/entity-timeline.ts index 0520982298..d3cc575898 100644 --- a/packages/agents-runtime/src/entity-timeline.ts +++ b/packages/agents-runtime/src/entity-timeline.ts @@ -19,7 +19,12 @@ import type { } from '@tanstack/db' import type { EntityStreamDB } from './entity-stream-db' import { formatPointerOrderToken, type EventPointer } from './event-pointer' -import type { ChildStatusEntry, MessageReceived, Signal } from './entity-schema' +import type { + ChildStatusEntry, + MessageReceived, + RealtimeTranscript, + Signal, +} from './entity-schema' import type { ManifestEntry, Wake, WakeMessage } from './types' export type EntityTimelineState = @@ -133,6 +138,13 @@ export type IncludesSignal = Omit & { order: TimelineOrder } +export type IncludesRealtimeTranscript = Omit< + RealtimeTranscript, + `_seq` | `_timeline_order` +> & { + order: TimelineOrder +} + export interface IncludesContextInserted { key: string order: TimelineOrder @@ -169,6 +181,7 @@ export interface EntityTimelineData { inbox: Array wakes: Array signals: Array + realtimeTranscripts?: Array contextInserted: Array contextRemoved: Array entities: Array @@ -250,6 +263,7 @@ export interface EntityTimelineRunRow { export type EntityTimelineInboxRow = IncludesInboxMessage export type EntityTimelineWakeRow = IncludesWakeMessage export type EntityTimelineSignalRow = IncludesSignal +export type EntityTimelineRealtimeTranscriptRow = IncludesRealtimeTranscript export type EntityTimelineQueryRow = | { @@ -258,6 +272,7 @@ export type EntityTimelineQueryRow = run?: undefined wake?: undefined signal?: undefined + realtimeTranscript?: undefined manifest?: undefined } | { @@ -266,6 +281,7 @@ export type EntityTimelineQueryRow = run: EntityTimelineRunRow wake?: undefined signal?: undefined + realtimeTranscript?: undefined manifest?: undefined } | { @@ -274,6 +290,7 @@ export type EntityTimelineQueryRow = run?: undefined wake: EntityTimelineWakeRow signal?: undefined + realtimeTranscript?: undefined manifest?: undefined } | { @@ -282,6 +299,16 @@ export type EntityTimelineQueryRow = run?: undefined wake?: undefined signal: EntityTimelineSignalRow + realtimeTranscript?: undefined + manifest?: undefined + } + | { + $key: string + inbox?: undefined + run?: undefined + wake?: undefined + signal?: undefined + realtimeTranscript: EntityTimelineRealtimeTranscriptRow manifest?: undefined } | { @@ -290,6 +317,7 @@ export type EntityTimelineQueryRow = run?: undefined wake?: undefined signal?: undefined + realtimeTranscript?: undefined manifest: ManifestEntry } @@ -405,6 +433,9 @@ export function normalizeEntityTimelineData( inbox: data.inbox, wakes: data.wakes, signals: data.signals ?? [], + realtimeTranscripts: [...(data.realtimeTranscripts ?? [])].sort( + compareTimelineOrder + ), contextInserted: data.contextInserted, contextRemoved: data.contextRemoved, entities: normalizeTimelineEntities(data.entities), @@ -441,6 +472,9 @@ type WakeRow = OrderedValue< type SignalRow = OrderedValue< EntityStreamDB[`collections`][`signals`][`toArray`][number] > +type RealtimeTranscriptValueRow = + EntityStreamDB[`collections`][`realtimeTranscripts`][`toArray`][number] +type RealtimeTranscriptRow = OrderedValue type ContextInsertedValueRow = EntityStreamDB[`collections`][`contextInserted`][`toArray`][number] type ContextRemovedValueRow = @@ -853,6 +887,22 @@ function buildSignalMessages(signals: Array): Array { }) } +function buildRealtimeTranscriptMessages( + transcripts: Array +): Array { + return [...transcripts].sort(compareTimelineOrder).map((transcript) => { + const { + _seq: _ignoredSeq, + _timeline_order: _ignoredTimelineOrder, + ...value + } = transcript + return { + ...value, + order: transcript.order, + } + }) +} + function buildContextInsertedMessages( entries: Array ): Array { @@ -971,6 +1021,14 @@ export function buildEntityTimelineData( const inbox = withOrderToken(db.collections.inbox) const wakes = withOrderToken(db.collections.wakes) const signals = withOrderToken(db.collections.signals) + const realtimeTranscripts = withOrderToken( + getOrderableCollection( + db.collections.realtimeTranscripts as + | typeof db.collections.realtimeTranscripts + | undefined, + `realtimeTranscripts` + ) + ) const contextInserted = withOrderToken( getOrderableCollection( db.collections.contextInserted as @@ -1018,6 +1076,7 @@ export function buildEntityTimelineData( inbox, wakes, signals, + realtimeTranscripts, contextInserted, contextRemoved, manifests.filter(hasOrderToken), @@ -1035,6 +1094,9 @@ export function buildEntityTimelineData( inbox: buildInboxMessages(withOrderFromOrderIndex(inbox, orderIndex)), wakes: buildWakeMessages(withOrderFromOrderIndex(wakes, orderIndex)), signals: buildSignalMessages(withOrderFromOrderIndex(signals, orderIndex)), + realtimeTranscripts: buildRealtimeTranscriptMessages( + withOrderFromOrderIndex(realtimeTranscripts, orderIndex) + ), contextInserted: buildContextInsertedMessages( withOrderAndHistoryOffsetFromOrderIndex(contextInserted, orderIndex) ), @@ -1285,6 +1347,28 @@ function buildEntityTimelineQuery( new_state: signal.new_state, })) + const realtimeTranscriptSource = q + .from({ realtimeTranscript: db.collections.realtimeTranscripts }) + .where(({ realtimeTranscript }) => + eq(realtimeTranscript.direction, `input`) + ) + .select(({ realtimeTranscript }) => ({ + key: realtimeTranscript.key, + order: coalesce(realtimeTranscript._timeline_order, `~`), + session_id: realtimeTranscript.session_id, + direction: realtimeTranscript.direction, + text: realtimeTranscript.text, + status: realtimeTranscript.status, + turn_id: realtimeTranscript.turn_id, + response_id: realtimeTranscript.response_id, + audio_stream: realtimeTranscript.audio_stream, + audio_offset: realtimeTranscript.audio_offset, + audio_next_offset: realtimeTranscript.audio_next_offset, + sample_start: realtimeTranscript.sample_start, + sample_end: realtimeTranscript.sample_end, + created_at: realtimeTranscript.created_at, + })) + const runItemsSource = q .unionAll({ text: db.collections.texts, @@ -1382,30 +1466,41 @@ function buildEntityTimelineQuery( run: runSource, wake: wakeSource, signal: signalSource, + realtimeTranscript: realtimeTranscriptSource, manifest: db.collections.manifests, }) - .orderBy(({ inbox, run, wake, signal, manifest }) => + .orderBy(({ inbox, run, wake, signal, realtimeTranscript, manifest }) => coalesce( inbox.order, run.order, wake.order, signal.order, + realtimeTranscript.order, manifest._timeline_order, `~` ) ) - .orderBy(({ inbox, run, wake, signal, manifest }) => + .orderBy(({ inbox, run, wake, signal, realtimeTranscript, manifest }) => coalesce( caseWhen(inbox.key, `inbox`), caseWhen(run.key, `run`), caseWhen(wake.key, `wake`), caseWhen(signal.key, `signal`), + caseWhen(realtimeTranscript.key, `realtimeTranscript`), caseWhen(manifest.key, `manifest`), `` ) ) - .orderBy(({ inbox, run, wake, signal, manifest }) => - coalesce(inbox.key, run.key, wake.key, signal.key, manifest.key, ``) + .orderBy(({ inbox, run, wake, signal, realtimeTranscript, manifest }) => + coalesce( + inbox.key, + run.key, + wake.key, + signal.key, + realtimeTranscript.key, + manifest.key, + `` + ) ) } @@ -1558,6 +1653,29 @@ export function createEntityIncludesQuery( new_state: signal.new_state, })) ), + realtimeTranscripts: toArray( + q + .from({ realtimeTranscript: db.collections.realtimeTranscripts }) + .orderBy(({ realtimeTranscript }) => + coalesce(realtimeTranscript._seq, -1) + ) + .select(({ realtimeTranscript }) => ({ + key: realtimeTranscript.key, + order: coalesce(realtimeTranscript._seq, -1), + session_id: realtimeTranscript.session_id, + direction: realtimeTranscript.direction, + text: realtimeTranscript.text, + status: realtimeTranscript.status, + turn_id: realtimeTranscript.turn_id, + response_id: realtimeTranscript.response_id, + audio_stream: realtimeTranscript.audio_stream, + audio_offset: realtimeTranscript.audio_offset, + audio_next_offset: realtimeTranscript.audio_next_offset, + sample_start: realtimeTranscript.sample_start, + sample_end: realtimeTranscript.sample_end, + created_at: realtimeTranscript.created_at, + })) + ), entities: toArray( q .from({ entity: entitiesCollection }) diff --git a/packages/agents-runtime/src/openai-realtime.ts b/packages/agents-runtime/src/openai-realtime.ts index 7914c95cc3..c5d827e883 100644 --- a/packages/agents-runtime/src/openai-realtime.ts +++ b/packages/agents-runtime/src/openai-realtime.ts @@ -31,6 +31,7 @@ type OpenAIRealtimeWebSocketConstructor = new ( ) => OpenAIRealtimeSocket const DEFAULT_OPENAI_REALTIME_MODEL = `gpt-realtime` +const DEFAULT_OPENAI_INPUT_TRANSCRIPTION_MODEL = `gpt-4o-mini-transcribe` export interface OpenAIRealtimeProviderOptions { apiKey: string | (() => MaybePromise) @@ -276,12 +277,27 @@ function realtimeFormat( } } +function inputTranscription( + input: RealtimeProviderConnectInput +): Record | undefined { + if (!input.audio?.inputFormat || input.audio.inputTranscription === false) { + return undefined + } + const config = input.audio.inputTranscription ?? {} + return { + model: config.model ?? DEFAULT_OPENAI_INPUT_TRANSCRIPTION_MODEL, + ...(config.language ? { language: config.language } : {}), + ...(config.prompt ? { prompt: config.prompt } : {}), + } +} + function buildSessionUpdate( opts: OpenAIRealtimeProviderOptions, input: RealtimeProviderConnectInput ): Record { const inputFormat = realtimeFormat(input.audio?.inputFormat) const outputFormat = realtimeFormat(input.audio?.outputFormat) + const transcription = inputTranscription(input) return { type: `session.update`, session: { @@ -300,6 +316,7 @@ function buildSessionUpdate( ? { input: { format: inputFormat, + ...(transcription ? { transcription } : {}), turn_detection: { type: `server_vad`, threshold: 0.5, diff --git a/packages/agents-runtime/src/timeline-context.ts b/packages/agents-runtime/src/timeline-context.ts index a416e14d32..c3506bdf29 100644 --- a/packages/agents-runtime/src/timeline-context.ts +++ b/packages/agents-runtime/src/timeline-context.ts @@ -7,6 +7,7 @@ import type { IncludesContextInserted, IncludesContextRemoved, IncludesInboxMessage, + IncludesRealtimeTranscript, IncludesRun, IncludesSignal, IncludesWakeMessage, @@ -69,12 +70,14 @@ export function buildTimelineMessages(input: { inbox: Array wakes?: Array signals?: Array + realtimeTranscripts?: Array }): Array { return materializeTimeline({ runs: input.runs, inbox: input.inbox, wakes: input.wakes ?? [], signals: input.signals ?? [], + realtimeTranscripts: input.realtimeTranscripts ?? [], contextInserted: [], contextRemoved: [], entities: [], @@ -223,6 +226,11 @@ export function defaultProjection( case `signal`: return [renderSignalMessage(item.signal)] + case `realtime_transcript`: + return item.direction === `input` && item.text.length > 0 + ? [{ role: `user`, content: item.text }] + : null + case `run`: { const messages: Array = [] @@ -357,6 +365,11 @@ export function materializeTimeline( | { kind: `inbox`; order: TimelineOrder; item: IncludesInboxMessage } | { kind: `wake`; order: TimelineOrder; item: IncludesWakeMessage } | { kind: `signal`; order: TimelineOrder; item: IncludesSignal } + | { + kind: `realtime_transcript` + order: TimelineOrder + item: IncludesRealtimeTranscript + } | { kind: `run`; order: TimelineOrder; item: IncludesRun } | { kind: `context_inserted` @@ -387,6 +400,13 @@ export function materializeTimeline( order: item.order, item, })), + ...(data.realtimeTranscripts ?? []) + .filter((item) => item.direction === `input` && item.text.length > 0) + .map((item) => ({ + kind: `realtime_transcript` as const, + order: item.order, + item, + })), ...data.runs.map((item) => ({ kind: `run` as const, order: item.order, @@ -445,6 +465,17 @@ export function materializeTimeline( signal: entry.item, } + case `realtime_transcript`: + return { + kind: `realtime_transcript`, + at: orderToOffset(entry.order), + key: entry.item.key, + sessionId: entry.item.session_id, + direction: entry.item.direction, + text: entry.item.text, + status: entry.item.status, + } + case `run`: return materializeRunItem(entry.item) diff --git a/packages/agents-runtime/src/types.ts b/packages/agents-runtime/src/types.ts index 53d875f4e8..3cd06eef52 100644 --- a/packages/agents-runtime/src/types.ts +++ b/packages/agents-runtime/src/types.ts @@ -382,6 +382,15 @@ export type TimelineItem = } | { kind: `wake`; at: number; payload: unknown } | { kind: `signal`; at: number; signal: EntitySignalEntry } + | { + kind: `realtime_transcript` + at: number + key: string + sessionId: string + direction: `input` | `output` + text: string + status: `partial` | `final` + } | { kind: `run` at: number @@ -939,9 +948,16 @@ export interface RealtimeAudioFormat { channels: number } +export interface RealtimeInputTranscriptionConfig { + model?: string + language?: string + prompt?: string +} + export interface RealtimeAudioConfig { inputFormat?: RealtimeAudioFormat outputFormat?: RealtimeAudioFormat + inputTranscription?: false | RealtimeInputTranscriptionConfig } export interface RealtimeToolPolicy { diff --git a/packages/agents-runtime/test/openai-realtime.test.ts b/packages/agents-runtime/test/openai-realtime.test.ts index 9de8bf4f07..163a246315 100644 --- a/packages/agents-runtime/test/openai-realtime.test.ts +++ b/packages/agents-runtime/test/openai-realtime.test.ts @@ -102,6 +102,7 @@ describe(`createOpenAIRealtimeProvider`, () => { audio: { input: { format: { type: `audio/pcm`, rate: 24_000 }, + transcription: { model: `gpt-4o-mini-transcribe` }, turn_detection: { type: `server_vad`, threshold: 0.5, @@ -125,6 +126,38 @@ describe(`createOpenAIRealtimeProvider`, () => { }) }) + it(`can disable input audio transcription`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + audio: { + inputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + inputTranscription: false, + }, + }) + + const socket = FakeWebSocket.instances[0]! + expect(socket.sent[0]).toMatchObject({ + session: { + audio: { + input: { + format: { type: `audio/pcm`, rate: 24_000 }, + }, + }, + }, + }) + expect( + (socket.sent[0] as any).session.audio.input.transcription + ).toBeUndefined() + }) + it(`sends audio input chunks as OpenAI input buffer events`, async () => { FakeWebSocket.instances = [] const provider = createOpenAIRealtimeProvider({ @@ -278,6 +311,44 @@ describe(`createOpenAIRealtimeProvider`, () => { }) }) + it(`maps GA input audio transcript events`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + const socket = FakeWebSocket.instances[0]! + const iterator = session.events[Symbol.asyncIterator]() + + socket.emitMessage({ + type: `conversation.item.input_audio_transcription.delta`, + item_id: `item-1`, + delta: `hello`, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `input_transcript.delta`, + turnId: `item-1`, + delta: `hello`, + }) + + socket.emitMessage({ + type: `conversation.item.input_audio_transcription.completed`, + item_id: `item-1`, + transcript: `hello there`, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `input_transcript.completed`, + turnId: `item-1`, + text: `hello there`, + }) + }) + it(`maps OpenAI events and executes function calls`, async () => { FakeWebSocket.instances = [] const execute = vi.fn().mockResolvedValue({ diff --git a/packages/agents-runtime/test/realtime-context.test.ts b/packages/agents-runtime/test/realtime-context.test.ts index 0c7109d46c..c80e4c61b3 100644 --- a/packages/agents-runtime/test/realtime-context.test.ts +++ b/packages/agents-runtime/test/realtime-context.test.ts @@ -65,6 +65,71 @@ describe(`ctx.useRealtime()`, () => { ]) }) + it(`persists realtime input and output transcripts`, async () => { + const { ctx } = createTestHandlerContext() + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ + events: [ + { type: `session.started`, sessionId: `provider-session` }, + { + type: `input_transcript.delta`, + delta: `hel`, + turnId: `input-item-1`, + }, + { + type: `input_transcript.delta`, + delta: `lo`, + turnId: `input-item-1`, + }, + { + type: `input_transcript.completed`, + text: `hello there`, + turnId: `input-item-1`, + }, + { + type: `output_transcript.delta`, + delta: `Hi`, + responseId: `resp-1`, + }, + { + type: `output_transcript.completed`, + text: `Hi there`, + responseId: `resp-1`, + }, + { type: `session.closed` }, + ], + }), + tools: [], + }) + + await realtime.run() + + expect(ctx.db.collections.realtimeTranscripts.toArray).toMatchObject([ + { + key: `realtime-transcript:provider-session:input:input-item-1`, + session_id: `provider-session`, + direction: `input`, + text: `hello there`, + status: `final`, + turn_id: `input-item-1`, + audio_stream: `input`, + created_at: expect.any(String), + }, + { + key: `realtime-transcript:provider-session:output:resp-1`, + session_id: `provider-session`, + direction: `output`, + text: `Hi there`, + status: `final`, + response_id: `resp-1`, + audio_stream: `output`, + created_at: expect.any(String), + }, + ]) + }) + it(`finds active realtime sessions from the manifest`, () => { const { ctx } = createTestHandlerContext() diff --git a/packages/agents-runtime/test/timeline-context.test.ts b/packages/agents-runtime/test/timeline-context.test.ts index 0370ca1b1c..6036c3db0a 100644 --- a/packages/agents-runtime/test/timeline-context.test.ts +++ b/packages/agents-runtime/test/timeline-context.test.ts @@ -6,6 +6,7 @@ import { import type { EntityStreamDB } from '../src/entity-stream-db' import type { IncludesInboxMessage, + IncludesRealtimeTranscript, IncludesRun, IncludesSignal, IncludesWakeMessage, @@ -172,6 +173,40 @@ describe(`timeline context`, () => { expect(result).toEqual([{ role: `user`, content: `updated text` }]) }) + it(`projects realtime input transcripts without duplicating output transcripts`, () => { + const realtimeTranscripts: Array = [ + { + key: `rt-in`, + order: order(1), + session_id: `rt-1`, + direction: `input`, + text: `voice question`, + status: `final`, + audio_stream: `input`, + created_at: `2026-03-28T00:00:00.000Z`, + }, + { + key: `rt-out`, + order: order(2), + session_id: `rt-1`, + direction: `output`, + text: `voice answer`, + status: `final`, + audio_stream: `output`, + created_at: `2026-03-28T00:00:01.000Z`, + }, + ] + + expect( + buildTimelineMessages({ + runs: [], + inbox: [], + wakes: [], + realtimeTranscripts, + }) + ).toEqual([{ role: `user`, content: `voice question` }]) + }) + it(`buildTimelineMessages keeps pending tool calls without emitting tool results`, () => { expect( buildTimelineMessages({ @@ -494,6 +529,7 @@ describe(`timeline context`, () => { __electricRowOffsets: new Map([[`wake-1`, offset(7)]]), }, signals: { toArray: [], __electricRowOffsets: new Map() }, + realtimeTranscripts: { toArray: [], __electricRowOffsets: new Map() }, contextInserted: { toArray: [], __electricRowOffsets: new Map() }, contextRemoved: { toArray: [], __electricRowOffsets: new Map() }, manifests: { toArray: [], __electricRowOffsets: new Map() }, @@ -536,6 +572,7 @@ describe(`timeline context`, () => { inbox: { toArray: [] }, wakes: { toArray: [] }, signals: { toArray: [] }, + realtimeTranscripts: { toArray: [] }, contextInserted: { toArray: [] }, contextRemoved: { toArray: [] }, manifests: { toArray: [] }, diff --git a/packages/agents-server-ui/src/components/EntityTimeline.tsx b/packages/agents-server-ui/src/components/EntityTimeline.tsx index fe79c0dd52..963d42c006 100644 --- a/packages/agents-server-ui/src/components/EntityTimeline.tsx +++ b/packages/agents-server-ui/src/components/EntityTimeline.tsx @@ -243,6 +243,13 @@ function estimateRowHeight( ) return Math.max(64, 48 + lines * lineHeight) + timelineRowGap(row) } + if (row.realtimeTranscript) { + const lines = Math.max( + 1, + Math.ceil(row.realtimeTranscript.text.length / charsPerLine) + ) + return Math.max(64, 48 + lines * lineHeight) + timelineRowGap(row) + } if (row.wake || row.signal || row.manifest) { return 76 + timelineRowGap(row) } @@ -271,6 +278,7 @@ function timelineRowSearchText( runSearchTextByKey: Map ): string { if (row.inbox) return readInboxText(row.inbox.payload) + if (row.realtimeTranscript) return row.realtimeTranscript.text if (row.wake) { return wakeSectionText({ kind: `wake`, @@ -286,6 +294,7 @@ function timelineRowSearchText( function timelineRowLabel(row: RenderTimelineRow): string { if (row.inbox?.from_agent) return `Agent message` if (row.inbox) return `User message` + if (row.realtimeTranscript) return `Voice message` if (row.wake) return `Wake` if (row.signal) return `Signal` if (row.manifest) return `Manifest item` @@ -979,6 +988,28 @@ const TimelineRow = memo(function TimelineRow({ ) } + if (row.realtimeTranscript) { + const timestamp = Date.parse(row.realtimeTranscript.created_at) + return ( + + ) + } + if (row.wake) { return ( = 0; index--) { const row = displayRows[index] - if (row?.inbox) { + if (row?.inbox || row?.realtimeTranscript) { return row.$key } } @@ -1235,6 +1266,9 @@ export function EntityTimeline({ if (row.inbox) { const timestamp = Date.parse(row.inbox.timestamp) lastUserTimestamp = Number.isFinite(timestamp) ? timestamp : null + } else if (row.realtimeTranscript) { + const timestamp = Date.parse(row.realtimeTranscript.created_at) + lastUserTimestamp = Number.isFinite(timestamp) ? timestamp : null } else if (row.run) { timestampByRowKey.set(row.$key, lastUserTimestamp) } From 28ecd7fcb56f77325637443911dc3b7050df155a Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Tue, 9 Jun 2026 14:23:03 +0100 Subject: [PATCH 17/31] feat(agents-ui): start realtime from spawn screen --- .../src/components/EntityContextDrawer.tsx | 2 + .../src/components/EntityTimeline.tsx | 4 + .../src/components/MessageInput.tsx | 22 +++ .../src/components/NewSessionPage.module.css | 30 ++++ .../src/components/views/ChatView.tsx | 21 +++ .../src/components/views/NewSessionView.tsx | 136 +++++++++++++++--- 6 files changed, 196 insertions(+), 19 deletions(-) diff --git a/packages/agents-server-ui/src/components/EntityContextDrawer.tsx b/packages/agents-server-ui/src/components/EntityContextDrawer.tsx index 7f93db305d..72d1f04fee 100644 --- a/packages/agents-server-ui/src/components/EntityContextDrawer.tsx +++ b/packages/agents-server-ui/src/components/EntityContextDrawer.tsx @@ -570,6 +570,7 @@ function manifestKindLabel(manifest: Manifest): string { case `schedule`: return manifest.scheduleType === `cron` ? `Cron schedule` : `Future send` } + return manifest.kind } function createParentEntry(parent: DrawerEntity): DrawerEntry { @@ -707,6 +708,7 @@ function createManifestEntry( entity: null, } } + return null } function describeSourceConfig(config: unknown): string { diff --git a/packages/agents-server-ui/src/components/EntityTimeline.tsx b/packages/agents-server-ui/src/components/EntityTimeline.tsx index 963d42c006..d70d64ccf3 100644 --- a/packages/agents-server-ui/src/components/EntityTimeline.tsx +++ b/packages/agents-server-ui/src/components/EntityTimeline.tsx @@ -754,6 +754,7 @@ function manifestKindLabel(manifest: Manifest): string { case `schedule`: return `Schedule` } + return manifest.kind } function manifestTitle(manifest: Manifest): string { @@ -769,6 +770,7 @@ function manifestTitle(manifest: Manifest): string { case `schedule`: return manifest.id } + return manifest.key } function manifestMeta(manifest: Manifest): string { @@ -790,6 +792,7 @@ function manifestMeta(manifest: Manifest): string { ? `${manifest.expression}${manifest.timezone ? ` · ${manifest.timezone}` : ``}` : `${manifest.fireAt} · ${manifest.status}` } + return `` } function manifestDetails( @@ -848,6 +851,7 @@ function manifestDetails( { label: `Status`, value: manifest.status ?? `pending` }, ] } + return [] } function manifestIcon(manifest: Manifest) { diff --git a/packages/agents-server-ui/src/components/MessageInput.tsx b/packages/agents-server-ui/src/components/MessageInput.tsx index a05ca7cafa..7602947ae8 100644 --- a/packages/agents-server-ui/src/components/MessageInput.tsx +++ b/packages/agents-server-ui/src/components/MessageInput.tsx @@ -49,6 +49,8 @@ export function MessageInput({ drawer, onSend, onStop, + autoStartRealtimeSignal, + onRealtimeAutoStartConsumed, }: { db: EntityStreamDBWithActions | null baseUrl: string @@ -66,6 +68,8 @@ export function MessageInput({ onOptimisticQueuedMessage?: (message: OptimisticInboxMessage) => void onSend?: () => void onStop?: () => void + autoStartRealtimeSignal?: string | null + onRealtimeAutoStartConsumed?: () => void /** * Optional content rendered above the composer, sharing its docked * width and lift into the timeline above. The composer is z-indexed @@ -93,6 +97,7 @@ export function MessageInput({ const [realtimeActive, setRealtimeActive] = useState(false) const [realtimeInputLevel, setRealtimeInputLevel] = useState(0) const realtimeSessionRef = useRef(null) + const handledAutoStartRealtimeRef = useRef(null) const composerFocusRef = useRef<{ focus: () => void } | null>(null) const inputDisabled = disabled || writeDisabled const attachmentsDisabled = @@ -284,6 +289,23 @@ export function MessageInput({ }) }, [baseUrl, canUseRealtime, entityUrl, realtimePending]) + useEffect(() => { + if (!autoStartRealtimeSignal) return + if (handledAutoStartRealtimeRef.current === autoStartRealtimeSignal) return + if (!canUseRealtime || realtimePending) return + handledAutoStartRealtimeRef.current = autoStartRealtimeSignal + onRealtimeAutoStartConsumed?.() + if (!realtimeSessionRef.current) { + handleRealtimeToggle() + } + }, [ + autoStartRealtimeSignal, + canUseRealtime, + handleRealtimeToggle, + onRealtimeAutoStartConsumed, + realtimePending, + ]) + const startEditing = useCallback( (message: EntityTimelineData[`inbox`][number]) => { if (inputDisabled) return diff --git a/packages/agents-server-ui/src/components/NewSessionPage.module.css b/packages/agents-server-ui/src/components/NewSessionPage.module.css index 70106efecc..241c4d76e1 100644 --- a/packages/agents-server-ui/src/components/NewSessionPage.module.css +++ b/packages/agents-server-ui/src/components/NewSessionPage.module.css @@ -470,6 +470,36 @@ display: inline-flex; } +.composerVoice { + all: unset; + display: inline-flex; + align-items: center; + justify-content: center; + width: 24px; + height: 24px; + border-radius: var(--ds-radius-full); + background: var(--ds-gray-a3); + color: var(--ds-text-3); + cursor: pointer; + transition: + background 0.12s ease, + color 0.12s ease, + opacity 0.12s ease; + flex-shrink: 0; +} +.composerVoice:hover:not(:disabled) { + background: var(--ds-gray-a4); + color: var(--ds-text-1); +} +.composerVoice:disabled { + cursor: not-allowed; + opacity: 0.55; +} +.composerVoicePending { + background: var(--ds-accent-a3); + color: var(--ds-accent-11); +} + .composerSend { all: unset; display: inline-flex; diff --git a/packages/agents-server-ui/src/components/views/ChatView.tsx b/packages/agents-server-ui/src/components/views/ChatView.tsx index 0f64c6c239..e08b143f81 100644 --- a/packages/agents-server-ui/src/components/views/ChatView.tsx +++ b/packages/agents-server-ui/src/components/views/ChatView.tsx @@ -12,6 +12,7 @@ import { useEntityPermissions, type EntityPermission, } from '../../hooks/useEntityPermission' +import { useWorkspace } from '../../hooks/useWorkspace' import type { ViewProps } from '../../lib/workspace/viewRegistry' import type { EntityTimelineQueryRow } from '@electric-ax/agents-runtime/client' import type { EventPointer } from '@electric-ax/agents-runtime' @@ -40,6 +41,7 @@ export function ChatView({ entityStopped, isSpawning, tileId, + viewParams, }: ViewProps): React.ReactElement { // While `spawning`, the entity has no inbox yet — `connectUrl` is null // so `useEntityTimeline` doesn't try to subscribe and we render an empty @@ -54,6 +56,7 @@ export function ChatView({ entityStopped={entityStopped} isSpawning={isSpawning} tileId={tileId} + viewParams={viewParams} /> ) } @@ -178,6 +181,7 @@ function GenericChatBody({ entityStopped, isSpawning, tileId, + viewParams, }: { baseUrl: string entityUrl: string | null @@ -185,6 +189,7 @@ function GenericChatBody({ entityStopped: boolean isSpawning: boolean tileId: string + viewParams?: ViewProps[`viewParams`] }): React.ReactElement { const { timelineRows, @@ -197,6 +202,7 @@ function GenericChatBody({ } = useEntityTimeline(baseUrl || null, entityUrl) const { signalEntity, forkEntity, entityTypesCollection } = useElectricAgents() + const { helpers } = useWorkspace() const permissions = useEntityPermissions(entity, CHAT_VIEW_PERMISSIONS) const canWrite = permissions.write const canSignal = permissions.signal @@ -282,6 +288,19 @@ function GenericChatBody({ setStopPending(false) }, [entityUrl]) + const autoStartRealtimeSignal = + viewParams?.realtime === `start` && entityUrl + ? `${entityUrl}:realtime:start` + : null + const handleRealtimeAutoStartConsumed = useCallback(() => { + const nextParams = Object.fromEntries( + Object.entries(viewParams ?? {}).filter(([key]) => key !== `realtime`) + ) + helpers.setTileView(tileId, `chat`, { + viewParams: Object.keys(nextParams).length > 0 ? nextParams : undefined, + }) + }, [helpers, tileId, viewParams]) + const stopGeneration = useCallback(() => { if (!canSignal) return if (!entityUrl || !signalEntity || !generationActive || stopPending) return @@ -399,6 +418,8 @@ function GenericChatBody({ )} onSend={() => setSentMessageSignal((value) => value + 1)} onStop={stopGeneration} + autoStartRealtimeSignal={autoStartRealtimeSignal} + onRealtimeAutoStartConsumed={handleRealtimeAutoStartConsumed} /> ) diff --git a/packages/agents-server-ui/src/components/views/NewSessionView.tsx b/packages/agents-server-ui/src/components/views/NewSessionView.tsx index a7ab1ba345..89474db819 100644 --- a/packages/agents-server-ui/src/components/views/NewSessionView.tsx +++ b/packages/agents-server-ui/src/components/views/NewSessionView.tsx @@ -5,6 +5,7 @@ import { ChevronDown, ChevronRight, Cpu, + Mic, Sparkles, } from 'lucide-react' import { eq, not, useLiveQuery } from '@tanstack/react-db' @@ -55,6 +56,7 @@ import type { SlashCommandRow, } from '@electric-ax/agents-runtime/client' import type { StandaloneViewProps } from '../../lib/workspace/viewRegistry' +import type { TileViewParams } from '../../lib/workspace/types' /** * The "default agent" — when an entity type with this name is registered @@ -62,6 +64,7 @@ import type { StandaloneViewProps } from '../../lib/workspace/viewRegistry' * so the most common flow is one keystroke away. */ const DEFAULT_AGENT_NAME = `horton` +const REALTIME_AUTOSTART_VIEW_PARAMS: TileViewParams = { realtime: `start` } const HERO_TITLES = [ `Let’s ship`, @@ -342,7 +345,8 @@ export function NewSessionView({ initialMessage?: unknown, initialMessageType?: string, initialAttachments?: Array, - sandboxProfile?: string | null + sandboxProfile?: string | null, + viewParams?: TileViewParams ): Promise => { if (!spawnEntity) return false setError(null) @@ -400,6 +404,7 @@ export function NewSessionView({ } helpers.openEntity(entityUrl, { target: { tileId, position: `replace` }, + ...(viewParams ? { viewParams } : {}), }) return true } catch (err) { @@ -448,20 +453,15 @@ export function NewSessionView({ return () => setToolbarTitle(null) }, [handleCancelSelected, selected, setToolbarTitle]) - const handleStartDefault = useCallback( - async ( - input: string | ComposerInputPayload, + const prepareDefaultAgentArgs = useCallback( + ( args: Record, - attachments: Array, sandboxProfile: string | null - ): Promise => { - if (!defaultAgent) return false - // Inject the picker's choice into the spawn args for the composer flow - // only — non-default agents have their own schemas and may not - // understand `workingDirectory`. A remote sandbox runs in the provider - // VM, so a host working directory is meaningless there: skip it for - // remote profiles. The spawned session itself becomes the newest - // synced recent for this runner. + ): Record => { + // Inject the picker's choice into the spawn args for the default-agent + // composer only — non-default agents have their own schemas and may not + // understand `workingDirectory`. Remote sandboxes run in provider VMs, so + // host paths are meaningless there. const profileIsRemote = isSandboxProfileRemote( allSandboxProfiles, sandboxProfile @@ -470,7 +470,20 @@ export function NewSessionView({ // factory — require a (non-remote) profile or the arg is a no-op. const includeWorkingDir = workingDirectory !== null && sandboxProfile !== null && !profileIsRemote - const augmented = includeWorkingDir ? { ...args, workingDirectory } : args + return includeWorkingDir ? { ...args, workingDirectory } : args + }, + [allSandboxProfiles, workingDirectory] + ) + + const handleStartDefault = useCallback( + async ( + input: string | ComposerInputPayload, + args: Record, + attachments: Array, + sandboxProfile: string | null + ): Promise => { + if (!defaultAgent) return false + const augmented = prepareDefaultAgentArgs(args, sandboxProfile) const hasAttachments = attachments.length > 0 const initialMessage = typeof input === `string` @@ -491,7 +504,27 @@ export function NewSessionView({ sandboxProfile ) }, - [defaultAgent, doSpawn, workingDirectory, allSandboxProfiles] + [defaultAgent, doSpawn, prepareDefaultAgentArgs] + ) + + const handleStartDefaultRealtime = useCallback( + async ( + args: Record, + sandboxProfile: string | null + ): Promise => { + if (!defaultAgent) return false + const augmented = prepareDefaultAgentArgs(args, sandboxProfile) + return await doSpawn( + defaultAgent.name, + augmented, + undefined, + undefined, + undefined, + sandboxProfile, + REALTIME_AUTOSTART_VIEW_PARAMS + ) + }, + [defaultAgent, doSpawn, prepareDefaultAgentArgs] ) const defaultComposerReady = @@ -529,6 +562,7 @@ export function NewSessionView({ defaultAgentSandboxProfiles={defaultAgent ? allSandboxProfiles : []} onSelectType={handleSelectType} onStartDefault={handleStartDefault} + onStartDefaultRealtime={handleStartDefaultRealtime} spawnReady={Boolean(spawnEntity)} defaultComposerReady={defaultComposerReady} error={error} @@ -551,6 +585,7 @@ function Picker({ defaultAgentSandboxProfiles, onSelectType, onStartDefault, + onStartDefaultRealtime, spawnReady, defaultComposerReady, error, @@ -571,6 +606,10 @@ function Picker({ attachments: Array, sandboxProfile: string | null ) => Promise + onStartDefaultRealtime: ( + args: Record, + sandboxProfile: string | null + ) => Promise spawnReady: boolean defaultComposerReady: boolean error: string | null @@ -606,6 +645,7 @@ function Picker({ agent={defaultAgent} sandboxProfiles={defaultAgentSandboxProfiles} onSubmit={onStartDefault} + onStartRealtime={onStartDefaultRealtime} disabled={!defaultComposerReady} workingDirectory={workingDirectory} onChangeWorkingDirectory={onChangeWorkingDirectory} @@ -925,6 +965,7 @@ function DefaultAgentComposer({ agent, sandboxProfiles, onSubmit, + onStartRealtime, disabled, workingDirectory, onChangeWorkingDirectory, @@ -941,6 +982,10 @@ function DefaultAgentComposer({ attachments: Array, sandboxProfile: string | null ) => Promise + onStartRealtime: ( + args: Record, + sandboxProfile: string | null + ) => Promise disabled?: boolean workingDirectory: string | null onChangeWorkingDirectory: (path: string | null) => void @@ -960,7 +1005,11 @@ function DefaultAgentComposer({ [sandboxProfiles, selectedSandboxProfile] ) const [value, setValue] = useState(``) - const [submitting, setSubmitting] = useState(false) + const [submittingMode, setSubmittingMode] = useState< + `message` | `realtime` | null + >(null) + const submitting = submittingMode !== null + const realtimeSubmitting = submittingMode === `realtime` const composerFocusRef = useRef<{ focus: () => void } | null>(null) const inlineProps = useMemo( () => inlineSchemaProperties(agent.creation_schema), @@ -1059,7 +1108,7 @@ function DefaultAgentComposer({ payload ?? serializeComposerInput(value, slashCommands) const trimmed = nextPayload.source.trim() if ((!trimmed && files.length === 0) || disabled || submitting) return - setSubmitting(true) + setSubmittingMode(`message`) const cleaned: Record = {} for (const [k, v] of Object.entries(args)) { if (v !== undefined && v !== ``) cleaned[k] = v @@ -1078,7 +1127,7 @@ function DefaultAgentComposer({ }) .catch(() => undefined) .finally(() => { - setSubmitting(false) + setSubmittingMode(null) }) }, [ @@ -1095,6 +1144,29 @@ function DefaultAgentComposer({ ] ) + const startRealtime = useCallback(() => { + const files = imageAttachmentsEnabled ? attachments : [] + if (disabled || submitting || files.length > 0) return + setSubmittingMode(`realtime`) + const cleaned: Record = {} + for (const [k, v] of Object.entries(args)) { + if (v !== undefined && v !== ``) cleaned[k] = v + } + void onStartRealtime(cleaned, selectedSandboxProfile) + .catch(() => undefined) + .finally(() => { + setSubmittingMode(null) + }) + }, [ + args, + attachments, + disabled, + imageAttachmentsEnabled, + onStartRealtime, + selectedSandboxProfile, + submitting, + ]) + const attachmentCount = imageAttachmentsEnabled ? attachments.length : 0 const isActive = Boolean( (value.trim() || attachmentCount > 0) && !disabled && !submitting @@ -1103,6 +1175,12 @@ function DefaultAgentComposer({ const sendTooltip = submitting ? `Starting ${agent.name} session` : `Start ${agent.name} session` + const realtimeTooltip = + attachmentCount > 0 + ? `Remove attachments to start voice mode` + : realtimeSubmitting + ? `Starting voice session` + : `Start voice session` return (
{submitting && ( - Starting… + + {realtimeSubmitting ? `Starting voice…` : `Starting…`} + )} + + + + + diff --git a/packages/agents-server-ui/src/components/views/NewSessionView.tsx b/packages/agents-server-ui/src/components/views/NewSessionView.tsx index 89474db819..fe0410fa9a 100644 --- a/packages/agents-server-ui/src/components/views/NewSessionView.tsx +++ b/packages/agents-server-ui/src/components/views/NewSessionView.tsx @@ -1,11 +1,11 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react' import { ArrowUp, + AudioLines, Check, ChevronDown, ChevronRight, Cpu, - Mic, Sparkles, } from 'lucide-react' import { eq, not, useLiveQuery } from '@tanstack/react-db' @@ -1272,7 +1272,7 @@ function DefaultAgentComposer({ .filter(Boolean) .join(` `)} > - + diff --git a/packages/agents-server-ui/src/lib/realtime-audio.ts b/packages/agents-server-ui/src/lib/realtime-audio.ts index 4379d7ee5c..c4a47279fc 100644 --- a/packages/agents-server-ui/src/lib/realtime-audio.ts +++ b/packages/agents-server-ui/src/lib/realtime-audio.ts @@ -32,6 +32,28 @@ const MIC_CAPTURE_CHUNK_SAMPLES = 1024 const MIC_WORKLET_PROCESSOR_NAME = `realtime-mic-capture` const BYTES_PER_PCM16_SAMPLE = 2 const TRUNCATE_SAFETY_MS = 80 +const MIC_PRE_ROLL_MS = 360 +const MIC_VAD_TAIL_MS = 700 +const MIC_MAX_QUEUE_MS = 1600 +const MIC_APPEND_BATCH_MS = 60 +const MIC_APPEND_DRAIN_WAIT_MS = 350 +const MIC_MIN_START_LEVEL = 0.012 +const MIC_MIN_CONTINUE_LEVEL = 0.006 +const MIC_PLAYBACK_START_LEVEL = 0.035 +const MIC_START_CONFIRM_CHUNKS = 1 +const MIC_PLAYBACK_START_CONFIRM_CHUNKS = 4 +const MIC_NOISE_MARGIN_START = 0.01 +const MIC_NOISE_MARGIN_CONTINUE = 0.004 +const MIC_NOISE_FLOOR_INITIAL = 0.003 +const MIC_NOISE_FLOOR_MAX = 0.018 +const MIC_NOISE_FLOOR_ALPHA = 0.008 + +const NO_RETRY_BACKOFF = { + initialDelay: 100, + maxDelay: 100, + multiplier: 1, + maxRetries: 0, +} type MicCapture = { node: AudioNode @@ -65,6 +87,24 @@ function pcm16DurationMs(byteLength: number): number { return (byteLength / BYTES_PER_PCM16_SAMPLE / REALTIME_SAMPLE_RATE) * 1000 } +function durationBytes(durationMs: number): number { + return Math.ceil( + (durationMs / 1000) * REALTIME_SAMPLE_RATE * BYTES_PER_PCM16_SAMPLE + ) +} + +function combineChunks(chunks: Array): Uint8Array { + if (chunks.length === 1) return chunks[0]! + const totalLength = chunks.reduce((sum, chunk) => sum + chunk.byteLength, 0) + const combined = new Uint8Array(totalLength) + let offset = 0 + for (const chunk of chunks) { + combined.set(chunk, offset) + offset += chunk.byteLength + } + return combined +} + function audioLevel(input: Float32Array): number { if (input.length === 0) return 0 let sumSquares = 0 @@ -85,10 +125,41 @@ function pcm16Floats(bytes: Uint8Array): Float32Array { return output } +function alignedPcm16Chunk( + chunk: Uint8Array, + remainder: Uint8Array | undefined +): { bytes: Uint8Array; remainder: Uint8Array | undefined } { + const bytes = remainder ? combineChunks([remainder, chunk]) : chunk + const alignedLength = + bytes.byteLength - (bytes.byteLength % BYTES_PER_PCM16_SAMPLE) + + return { + bytes: + alignedLength === bytes.byteLength + ? bytes + : bytes.subarray(0, alignedLength), + remainder: + alignedLength === bytes.byteLength + ? undefined + : bytes.slice(alignedLength), + } +} + function jsonBytes(value: unknown): Uint8Array { return new TextEncoder().encode(JSON.stringify(value)) } +function delay(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)) +} + +async function settleWithin( + promise: Promise, + timeoutMs: number +): Promise { + await Promise.race([promise, delay(timeoutMs)]) +} + function micWorkletSource(): string { return ` class RealtimeMicCaptureProcessor extends AudioWorkletProcessor { @@ -148,28 +219,18 @@ registerProcessor('${MIC_WORKLET_PROCESSOR_NAME}', RealtimeMicCaptureProcessor) ` } -function trackPendingAppend( - pending: Set>, - append: Promise, - onError: (error: unknown) => void -): void { - let tracked: Promise - tracked = append.catch(onError).finally(() => { - pending.delete(tracked) - }) - pending.add(tracked) -} - function streamHandle( baseUrl: string, path: string, - contentType: string + contentType: string, + opts: { retryWrites?: boolean } = {} ): DurableStream { const url = streamUrl(baseUrl, path) return new DurableStream({ url, headers: getConfiguredServerHeaders(url), contentType, + ...(opts.retryWrites === false ? { backoffOptions: NO_RETRY_BACKOFF } : {}), batching: true, }) } @@ -323,15 +384,153 @@ export async function startRealtimeAudioSession({ let currentOutputStartedAt: number | null = null let currentOutputReceivedMs = 0 let micChunks = 0 + let micSentChunks = 0 let playbackChunks = 0 let controlEvents = 0 - const playbackNodes = new Set() + let speechTurns = 0 + let voiceCandidateChunks = 0 + let noiseFloor = MIC_NOISE_FLOOR_INITIAL + let speechActive = false + let lastVoiceAt = 0 + let audioQueuedBytes = 0 + let audioInputStopping = false + let audioInputError: Error | undefined + let wakeAudioInputWriter: (() => void) | undefined + let activeResponseId: string | undefined + let responseActive = false + const preSpeechChunks: Array = [] + const audioQueue: Array = [] const pendingAudioAppends = new Set>() + const playbackNodes = new Set() + let audioInputWriter = Promise.resolve() const appendControl = async (value: unknown): Promise => { await controlIn?.append(jsonBytes(value)) } + const wakeAudioWriter = (): void => { + wakeAudioInputWriter?.() + wakeAudioInputWriter = undefined + } + + const playbackIsActive = (): boolean => + playbackNodes.size > 0 || + nextPlaybackTime > playbackContext.currentTime + 0.05 + + const trimPreSpeechChunks = (): void => { + const maxBytes = durationBytes(MIC_PRE_ROLL_MS) + let total = preSpeechChunks.reduce( + (sum, chunk) => sum + chunk.byteLength, + 0 + ) + while (total > maxBytes && preSpeechChunks.length > 0) { + const dropped = preSpeechChunks.shift()! + total -= dropped.byteLength + } + } + + const rememberPreSpeechChunk = (bytes: Uint8Array): void => { + preSpeechChunks.push(bytes) + trimPreSpeechChunks() + } + + const dropStaleAudio = (): void => { + const maxBytes = durationBytes(MIC_MAX_QUEUE_MS) + while (audioQueuedBytes > maxBytes && audioQueue.length > 1) { + const dropped = audioQueue.shift()! + audioQueuedBytes -= dropped.byteLength + } + } + + const enqueueAudioInput = (bytes: Uint8Array): void => { + audioQueue.push(bytes) + audioQueuedBytes += bytes.byteLength + dropStaleAudio() + wakeAudioWriter() + } + + const dequeueAudioBatch = (): Uint8Array | null => { + if (audioQueue.length === 0) return null + const maxBytes = durationBytes(MIC_APPEND_BATCH_MS) + let batchBytes = 0 + const chunks: Array = [] + while (audioQueue.length > 0) { + const next = audioQueue[0]! + if (chunks.length > 0 && batchBytes + next.byteLength > maxBytes) break + chunks.push(audioQueue.shift()!) + batchBytes += next.byteLength + audioQueuedBytes -= next.byteLength + } + return combineChunks(chunks) + } + + const toError = (value: unknown): Error => + value instanceof Error ? value : new Error(String(value)) + + const throwIfAudioInputFailed = (): void => { + if (audioInputError) throw audioInputError + } + + const waitForPendingAudioInput = async ( + timeoutMs = MIC_APPEND_DRAIN_WAIT_MS + ): Promise => { + if (pendingAudioAppends.size > 0) { + await Promise.race([ + Promise.all(Array.from(pendingAudioAppends)), + new Promise((resolve) => setTimeout(resolve, timeoutMs)), + ]) + } + throwIfAudioInputFailed() + } + + const trackAudioAppend = ( + audioIn: DurableStream, + batch: Uint8Array + ): void => { + const append = audioIn + .append(batch) + .then(() => { + micSentChunks += 1 + if (micSentChunks === 1) { + console.info( + `[realtime-audio] microphone first sent chunk session=${session?.sessionId} bytes=${batch.byteLength}` + ) + } + }) + .catch((error) => { + audioInputError ??= toError(error) + console.warn(`[realtime-audio] microphone append failed`, error) + }) + .finally(() => { + pendingAudioAppends.delete(append) + }) + pendingAudioAppends.add(append) + } + + const runAudioInputWriter = async (audioIn: DurableStream): Promise => { + while ( + !audioInputStopping || + audioQueue.length > 0 || + pendingAudioAppends.size > 0 + ) { + throwIfAudioInputFailed() + const batch = dequeueAudioBatch() + if (batch) { + trackAudioAppend(audioIn, batch) + continue + } + + if (audioInputStopping && pendingAudioAppends.size > 0) { + await waitForPendingAudioInput(250) + continue + } + + await new Promise((resolve) => { + wakeAudioInputWriter = resolve + }) + } + } + const stopScheduledPlayback = (): void => { for (const node of playbackNodes) { try { @@ -352,8 +551,18 @@ export async function startRealtimeAudioSession({ currentOutputReceivedMs = 0 } - const interruptPlayback = (): void => { + const interruptPlayback = ({ + cancelResponse = true, + }: { cancelResponse?: boolean } = {}): void => { const itemId = currentOutputItemId + const wasResponseActive = responseActive + responseActive = false + if (cancelResponse && (wasResponseActive || itemId)) { + void appendControl({ type: `response.cancel` }).catch((error) => { + console.warn(`[realtime-audio] response cancel failed`, error) + }) + } + if (!itemId) { stopScheduledPlayback() return @@ -386,22 +595,31 @@ export async function startRealtimeAudioSession({ } const cleanup = async (sendClose: boolean): Promise => { - abort.abort() micCapture?.cleanup() micCapture?.node.disconnect() silentOutput?.disconnect() source?.disconnect() onInputLevel?.(0) for (const track of media?.getTracks() ?? []) track.stop() + audioInputStopping = true + wakeAudioWriter() + await settleWithin(audioInputWriter, 250) + abort.abort() stopScheduledPlayback() - await Promise.allSettled(pendingAudioAppends) + await settleWithin(audioInputWriter, 250) if (sendClose && controlIn) { - await appendControl({ - type: `session.close`, - reason: `client-stop`, - }).catch(() => undefined) + await settleWithin( + appendControl({ + type: `session.close`, + reason: `client-stop`, + }).catch(() => undefined), + 500 + ) } - await Promise.allSettled([playback, control]) + await Promise.allSettled([ + settleWithin(playback, 250), + settleWithin(control, 250), + ]) await Promise.allSettled([micContext.close(), playbackContext.close()]) } @@ -423,7 +641,8 @@ export async function startRealtimeAudioSession({ const audioIn = streamHandle( baseUrl, session.streams.audio_in, - `audio/pcm; rate=${REALTIME_SAMPLE_RATE}; channels=1` + `audio/pcm; rate=${REALTIME_SAMPLE_RATE}; channels=1`, + { retryWrites: false } ) const audioOut = streamHandle( baseUrl, @@ -433,13 +652,19 @@ export async function startRealtimeAudioSession({ controlIn = streamHandle( baseUrl, session.streams.control_in, - `application/json` + `application/json`, + { retryWrites: false } ) const controlOut = streamHandle( baseUrl, session.streams.control_out, `application/json` ) + audioInputWriter = runAudioInputWriter(audioIn).catch((error) => { + if (!abort.signal.aborted) { + console.warn(`[realtime-audio] microphone writer failed`, error) + } + }) const handleInputAudio = (bytes: Uint8Array, level: number): void => { if (abort.signal.aborted) return @@ -450,13 +675,60 @@ export async function startRealtimeAudioSession({ `[realtime-audio] microphone first chunk session=${session?.sessionId} bytes=${bytes.byteLength}` ) } - trackPendingAppend( - pendingAudioAppends, - audioIn.append(bytes), - (error) => { - console.warn(`[realtime-audio] microphone append failed`, error) - } + rememberPreSpeechChunk(bytes) + + const now = performance.now() + const startThreshold = Math.max( + MIC_MIN_START_LEVEL, + noiseFloor + MIC_NOISE_MARGIN_START, + playbackIsActive() ? MIC_PLAYBACK_START_LEVEL : 0 + ) + const continueThreshold = Math.max( + MIC_MIN_CONTINUE_LEVEL, + noiseFloor + MIC_NOISE_MARGIN_CONTINUE ) + const hasVoice = + level >= (speechActive ? continueThreshold : startThreshold) + + if (hasVoice) { + lastVoiceAt = now + if (!speechActive) { + voiceCandidateChunks += 1 + const requiredChunks = playbackIsActive() + ? MIC_PLAYBACK_START_CONFIRM_CHUNKS + : MIC_START_CONFIRM_CHUNKS + if (voiceCandidateChunks < requiredChunks) return + + voiceCandidateChunks = 0 + speechActive = true + speechTurns += 1 + console.info( + `[realtime-audio] microphone voice gate opened session=${session?.sessionId} turn=${speechTurns} level=${level.toFixed(4)} threshold=${startThreshold.toFixed(4)} noiseFloor=${noiseFloor.toFixed(4)}` + ) + for (const chunk of preSpeechChunks.splice(0)) { + enqueueAudioInput(chunk) + } + return + } + enqueueAudioInput(bytes) + return + } + + voiceCandidateChunks = 0 + + if (speechActive) { + if (now - lastVoiceAt < MIC_VAD_TAIL_MS) { + enqueueAudioInput(bytes) + return + } + speechActive = false + } + + if (!speechActive && level < startThreshold) { + noiseFloor = + noiseFloor * (1 - MIC_NOISE_FLOOR_ALPHA) + + Math.min(level, MIC_NOISE_FLOOR_MAX) * MIC_NOISE_FLOOR_ALPHA + } } source = micContext.createMediaStreamSource(media) micCapture = await createMicCapture(micContext, handleInputAudio) @@ -475,6 +747,7 @@ export async function startRealtimeAudioSession({ signal: abort.signal, warnOnHttp: false, }) + let playbackRemainder: Uint8Array | undefined try { for await (const chunk of response.bodyStream()) { if (abort.signal.aborted || chunk.byteLength === 0) continue @@ -484,7 +757,11 @@ export async function startRealtimeAudioSession({ `[realtime-audio] playback first chunk session=${session?.sessionId} bytes=${chunk.byteLength}` ) } - const samples = pcm16Floats(chunk) + const aligned = alignedPcm16Chunk(chunk, playbackRemainder) + playbackRemainder = aligned.remainder + if (aligned.bytes.byteLength === 0) continue + + const samples = pcm16Floats(aligned.bytes) const buffer = playbackContext.createBuffer( 1, samples.length, @@ -533,7 +810,25 @@ export async function startRealtimeAudioSession({ `[realtime-audio] control first event session=${session?.sessionId} type=${event.type}` ) } - if ( + if (event.type === `response.started`) { + activeResponseId = + typeof event.responseId === `string` + ? event.responseId + : undefined + responseActive = true + } else if ( + event.type === `response.completed` || + event.type === `response.cancelled` + ) { + if ( + !activeResponseId || + typeof event.responseId !== `string` || + event.responseId === activeResponseId + ) { + activeResponseId = undefined + responseActive = false + } + } else if ( event.type === `output_audio.delta` && typeof event.itemId === `string` ) { @@ -542,7 +837,7 @@ export async function startRealtimeAudioSession({ currentOutputReceivedMs += pcm16DurationMs(event.byteLength) } } else if (event.type === `input_audio.speech_started`) { - interruptPlayback() + interruptPlayback({ cancelResponse: false }) } } } finally { diff --git a/packages/agents-server/src/routing/hooks.ts b/packages/agents-server/src/routing/hooks.ts index 0aea49744d..090b57c093 100644 --- a/packages/agents-server/src/routing/hooks.ts +++ b/packages/agents-server/src/routing/hooks.ts @@ -88,6 +88,15 @@ export function applyCors( `electric-owner-entity`, ELECTRIC_PRINCIPAL_HEADER, `ngrok-skip-browser-warning`, + `producer-id`, + `producer-epoch`, + `producer-seq`, + `producer-expected-seq`, + `producer-received-seq`, + `stream-closed`, + `stream-expires-at`, + `stream-seq`, + `stream-ttl`, ].join(`, `) ) headers.set(`access-control-expose-headers`, `*`) diff --git a/packages/agents-server/test/routing-hooks.test.ts b/packages/agents-server/test/routing-hooks.test.ts index 84cb3761e5..d82028052a 100644 --- a/packages/agents-server/test/routing-hooks.test.ts +++ b/packages/agents-server/test/routing-hooks.test.ts @@ -41,9 +41,12 @@ describe(`routing/hooks`, () => { expect(wrapped?.headers.get(`access-control-allow-methods`)).toContain( `GET` ) - expect(wrapped?.headers.get(`access-control-allow-headers`)).toContain( - `electric-principal` - ) + const allowedHeaders = wrapped?.headers.get(`access-control-allow-headers`) + expect(allowedHeaders).toContain(`electric-principal`) + expect(allowedHeaders).toContain(`producer-id`) + expect(allowedHeaders).toContain(`producer-epoch`) + expect(allowedHeaders).toContain(`producer-seq`) + expect(allowedHeaders).toContain(`stream-closed`) }) it(`errorMapper converts ElectricAgentsError to API error JSON`, async () => { diff --git a/packages/agents/src/agents/horton.ts b/packages/agents/src/agents/horton.ts index 49679d2ef0..aaa25f8727 100644 --- a/packages/agents/src/agents/horton.ts +++ b/packages/agents/src/agents/horton.ts @@ -771,6 +771,18 @@ function createAssistantHandler(options: { audio: { inputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, outputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + inputTranscription: { + model: `gpt-realtime-whisper`, + delay: `minimal`, + }, + turnDetection: { + type: `server_vad`, + threshold: 0.55, + prefixPaddingMs: 300, + silenceDurationMs: 500, + createResponse: true, + interruptResponse: true, + }, }, toolPolicy: { direct: hortonRealtimeDirectTools(tools as AgentTool[]), diff --git a/packages/agents/test/horton-tool-composition.test.ts b/packages/agents/test/horton-tool-composition.test.ts index f2caa8db41..5e0874bb14 100644 --- a/packages/agents/test/horton-tool-composition.test.ts +++ b/packages/agents/test/horton-tool-composition.test.ts @@ -201,6 +201,12 @@ describe(`horton tool composition`, () => { [ { provider: { id: string; model: string } + audio: { + inputTranscription?: { + model?: string + delay?: string + } + } toolPolicy: { direct: Array } }, ] @@ -210,6 +216,10 @@ describe(`horton tool composition`, () => { id: `openai`, model: `gpt-realtime`, }) + expect(realtimeConfig.audio.inputTranscription).toEqual({ + model: `gpt-realtime-whisper`, + delay: `minimal`, + }) expect(realtimeConfig.toolPolicy.direct).toEqual( expect.arrayContaining([ `web_search`, From 2b213fdaf180413113bf401e16f4e18972e40b3b Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Wed, 10 Jun 2026 09:13:29 +0100 Subject: [PATCH 26/31] fix(agents): use gpt-realtime-2 --- packages/agents-runtime/src/openai-realtime.ts | 2 +- .../test/electric-agents-client.test.ts | 6 +++--- .../test/openai-realtime.test.ts | 4 ++-- .../test/realtime-context.test.ts | 18 +++++++++--------- ...ntime-server-client-update-metadata.test.ts | 8 ++++---- .../agents-server-ui/src/lib/realtime-audio.ts | 2 +- ...ric-agents-manager-write-validation.test.ts | 6 +++--- .../test/horton-tool-composition.test.ts | 4 ++-- 8 files changed, 25 insertions(+), 25 deletions(-) diff --git a/packages/agents-runtime/src/openai-realtime.ts b/packages/agents-runtime/src/openai-realtime.ts index b7111d8342..caa48d6a8f 100644 --- a/packages/agents-runtime/src/openai-realtime.ts +++ b/packages/agents-runtime/src/openai-realtime.ts @@ -31,7 +31,7 @@ type OpenAIRealtimeWebSocketConstructor = new ( init?: unknown ) => OpenAIRealtimeSocket -const DEFAULT_OPENAI_REALTIME_MODEL = `gpt-realtime` +const DEFAULT_OPENAI_REALTIME_MODEL = `gpt-realtime-2` const DEFAULT_OPENAI_INPUT_TRANSCRIPTION_MODEL = `gpt-4o-mini-transcribe` export interface OpenAIRealtimeProviderOptions { diff --git a/packages/agents-runtime/test/electric-agents-client.test.ts b/packages/agents-runtime/test/electric-agents-client.test.ts index b493cc3aa7..309d7512df 100644 --- a/packages/agents-runtime/test/electric-agents-client.test.ts +++ b/packages/agents-runtime/test/electric-agents-client.test.ts @@ -55,7 +55,7 @@ describe(`createAgentsClient`, () => { sessionId: `rt-1`, entityUrl: `/horton/demo`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `requested`, startedAt: `2026-06-09T10:00:00.000Z`, streams: { @@ -176,7 +176,7 @@ describe(`createAgentsClient`, () => { client.startRealtimeSession({ entityUrl: `/horton/demo`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, }) ).resolves.toMatchObject({ sessionId: `rt-1`, @@ -188,7 +188,7 @@ describe(`createAgentsClient`, () => { expect(mockState.startRealtimeSession).toHaveBeenCalledWith({ entityUrl: `/horton/demo`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, }) }) diff --git a/packages/agents-runtime/test/openai-realtime.test.ts b/packages/agents-runtime/test/openai-realtime.test.ts index 7beba32642..f6703faaf9 100644 --- a/packages/agents-runtime/test/openai-realtime.test.ts +++ b/packages/agents-runtime/test/openai-realtime.test.ts @@ -76,7 +76,7 @@ describe(`createOpenAIRealtimeProvider`, () => { const socket = FakeWebSocket.instances[0]! expect(socket.url).toBe( - `wss://api.openai.com/v1/realtime?model=gpt-realtime` + `wss://api.openai.com/v1/realtime?model=gpt-realtime-2` ) expect(socket.init).toEqual({ headers: { @@ -88,7 +88,7 @@ describe(`createOpenAIRealtimeProvider`, () => { type: `session.update`, session: { type: `realtime`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, instructions: `You are Horton.`, output_modalities: [`audio`], tool_choice: `auto`, diff --git a/packages/agents-runtime/test/realtime-context.test.ts b/packages/agents-runtime/test/realtime-context.test.ts index 66b2d81bb1..5de390de11 100644 --- a/packages/agents-runtime/test/realtime-context.test.ts +++ b/packages/agents-runtime/test/realtime-context.test.ts @@ -519,7 +519,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `active`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, @@ -546,7 +546,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `requested`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, @@ -592,7 +592,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `requested`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, @@ -609,7 +609,7 @@ describe(`ctx.useRealtime()`, () => { systemPrompt: `You are realtime.`, provider: { id: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, connect: async () => { throw new Error(`missing key`) }, @@ -694,7 +694,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `active`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, @@ -772,7 +772,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `active`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, @@ -844,7 +844,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `active`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, @@ -917,7 +917,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `active`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, @@ -986,7 +986,7 @@ describe(`ctx.useRealtime()`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `active`, startedAt: `2026-06-09T12:00:00.000Z`, endedAt: null, diff --git a/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts b/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts index 3290f6b91a..c88a9ebad4 100644 --- a/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts +++ b/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts @@ -143,7 +143,7 @@ describe(`runtime-server-client realtime sessions`, () => { sessionId: `rt-1`, entityUrl: `/horton/demo`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `requested`, startedAt: `2026-06-09T10:00:00.000Z`, streams: { @@ -171,7 +171,7 @@ describe(`runtime-server-client realtime sessions`, () => { entityUrl: `/horton/demo`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, inputAudio: { codec: `pcm16`, sampleRate: 16_000, channels: 1 }, meta: { source: `button` }, }) @@ -189,7 +189,7 @@ describe(`runtime-server-client realtime sessions`, () => { entityUrl: `/horton/demo`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, inputAudio: { codec: `pcm16`, sampleRate: 16_000, channels: 1 }, meta: { source: `button` }, }) @@ -208,7 +208,7 @@ describe(`runtime-server-client realtime sessions`, () => { client.startRealtimeSession({ entityUrl: `/horton/demo`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, }) ).rejects.toThrow(/startRealtimeSession.*401.*not allowed/) }) diff --git a/packages/agents-server-ui/src/lib/realtime-audio.ts b/packages/agents-server-ui/src/lib/realtime-audio.ts index c4a47279fc..155387948a 100644 --- a/packages/agents-server-ui/src/lib/realtime-audio.ts +++ b/packages/agents-server-ui/src/lib/realtime-audio.ts @@ -333,7 +333,7 @@ async function createRealtimeSession( body: JSON.stringify({ entityUrl, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, inputAudio: { codec: `pcm16`, sampleRate: REALTIME_SAMPLE_RATE, diff --git a/packages/agents-server/test/electric-agents-manager-write-validation.test.ts b/packages/agents-server/test/electric-agents-manager-write-validation.test.ts index e8d309db32..085d851ec8 100644 --- a/packages/agents-server/test/electric-agents-manager-write-validation.test.ts +++ b/packages/agents-server/test/electric-agents-manager-write-validation.test.ts @@ -158,7 +158,7 @@ describe(`ElectricAgentsManager realtime sessions`, () => { const result = await manager.createRealtimeSession(`/chat/session-1`, { id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, inputAudio: { codec: `pcm16`, sampleRate: 16_000, channels: 1 }, outputAudio: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, meta: { source: `test` }, @@ -203,7 +203,7 @@ describe(`ElectricAgentsManager realtime sessions`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `requested`, streams: result.streams, retention: `forever`, @@ -216,7 +216,7 @@ describe(`ElectricAgentsManager realtime sessions`, () => { value: { session_id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `requested`, streams: result.streams, }, diff --git a/packages/agents/test/horton-tool-composition.test.ts b/packages/agents/test/horton-tool-composition.test.ts index 5e0874bb14..5de45c1220 100644 --- a/packages/agents/test/horton-tool-composition.test.ts +++ b/packages/agents/test/horton-tool-composition.test.ts @@ -167,7 +167,7 @@ describe(`horton tool composition`, () => { kind: `realtime-session`, id: `rt-1`, provider: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, status: `active`, startedAt: `2026-06-09T12:00:00.000Z`, retention: `forever`, @@ -214,7 +214,7 @@ describe(`horton tool composition`, () => { )[0]![0] expect(realtimeConfig.provider).toMatchObject({ id: `openai`, - model: `gpt-realtime`, + model: `gpt-realtime-2`, }) expect(realtimeConfig.audio.inputTranscription).toEqual({ model: `gpt-realtime-whisper`, From 3bc5e2d27b027e92f3bc7f4232f57bfe0dcbd253 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Wed, 10 Jun 2026 09:41:55 +0100 Subject: [PATCH 27/31] feat(agents): expose realtime settings --- packages/agents-desktop/src/app/controller.ts | 18 ++ .../agents-desktop/src/ipc/preferences.ts | 11 + packages/agents-desktop/src/preload.ts | 6 + .../agents-desktop/src/settings/realtime.ts | 66 ++++++ packages/agents-desktop/src/settings/store.ts | 8 +- packages/agents-desktop/src/shared/types.ts | 22 ++ .../components/settings/SettingsSidebar.tsx | 8 + .../settings/pages/RealtimePage.module.css | 61 +++++ .../settings/pages/RealtimePage.tsx | 215 ++++++++++++++++++ .../src/hooks/useDocumentTitle.ts | 1 + .../src/lib/realtime-audio.ts | 4 +- .../src/lib/server-connection.ts | 58 +++++ packages/agents-server-ui/src/router.tsx | 4 + 13 files changed, 480 insertions(+), 2 deletions(-) create mode 100644 packages/agents-desktop/src/settings/realtime.ts create mode 100644 packages/agents-server-ui/src/components/settings/pages/RealtimePage.module.css create mode 100644 packages/agents-server-ui/src/components/settings/pages/RealtimePage.tsx diff --git a/packages/agents-desktop/src/app/controller.ts b/packages/agents-desktop/src/app/controller.ts index 1560564d4b..0b7f961fa1 100644 --- a/packages/agents-desktop/src/app/controller.ts +++ b/packages/agents-desktop/src/app/controller.ts @@ -11,6 +11,7 @@ import * as DesktopIpc from '../ipc/register' import { ensureRuntimeEntry as ensureRuntimeEntryInStore } from '../runtime/entries' import { createRuntimeController } from '../runtime/controller' import * as SettingsBootstrap from '../settings/bootstrap' +import * as RealtimeSettings from '../settings/realtime' import * as ServerSelection from '../settings/selection' import { saveDesktopSettings } from '../settings/store' import { desktopStateForWindow as desktopStateForWindowImpl } from '../state/desktop-state' @@ -30,6 +31,7 @@ import type { DesktopMenuSection, DesktopMenuState, DesktopState, + RealtimeSettings as RealtimeSettingsConfig, RuntimeEntry, ServerConfig, } from '../shared/types' @@ -328,6 +330,20 @@ export function createDesktopMainController(ctx: DesktopAppContext) { runtime.refreshPowerSaveBlocker() } + const getRealtimeSettingsStatus = () => + RealtimeSettings.realtimeSettingsStatus({ + settings, + apiKeys, + launchEnv: ctx.envApiKeysSnapshot, + }) + + const setRealtimeSettings = async ( + next: RealtimeSettingsConfig + ): Promise => { + settings.realtime = RealtimeSettings.normalizeRealtimeSettings(next) + await saveSettings() + } + const syncLaunchAtLoginSetting = async (): Promise => { await LoginItems.setLaunchAtLogin(settings.launchAtLogin === true) } @@ -438,6 +454,8 @@ export function createDesktopMainController(ctx: DesktopAppContext) { setLaunchAtLogin, getPreventAppSuspension, setPreventAppSuspension, + getRealtimeSettingsStatus, + setRealtimeSettings, } const loadSettings = (): Promise => diff --git a/packages/agents-desktop/src/ipc/preferences.ts b/packages/agents-desktop/src/ipc/preferences.ts index cfd50bab3f..2a934afd98 100644 --- a/packages/agents-desktop/src/ipc/preferences.ts +++ b/packages/agents-desktop/src/ipc/preferences.ts @@ -2,6 +2,8 @@ import { ipcMain } from 'electron' import type { LaunchAtLoginStatus, PreventAppSuspensionPreference, + RealtimeSettings, + RealtimeSettingsStatus, } from '../shared/types' export type PreferencesIpcDeps = { @@ -9,6 +11,8 @@ export type PreferencesIpcDeps = { setLaunchAtLogin: (enabled: boolean) => Promise getPreventAppSuspension: () => PreventAppSuspensionPreference setPreventAppSuspension: (enabled: boolean) => Promise + getRealtimeSettingsStatus: () => RealtimeSettingsStatus + setRealtimeSettings: (settings: RealtimeSettings) => Promise } export function registerPreferencesIpcHandlers(deps: PreferencesIpcDeps): void { @@ -25,4 +29,11 @@ export function registerPreferencesIpcHandlers(deps: PreferencesIpcDeps): void { `desktop:set-prevent-app-suspension`, (_event, enabled: boolean) => deps.setPreventAppSuspension(Boolean(enabled)) ) + ipcMain.handle(`desktop:get-realtime-settings`, () => + deps.getRealtimeSettingsStatus() + ) + ipcMain.handle( + `desktop:set-realtime-settings`, + (_event, settings: RealtimeSettings) => deps.setRealtimeSettings(settings) + ) } diff --git a/packages/agents-desktop/src/preload.ts b/packages/agents-desktop/src/preload.ts index 82c437a935..af780edb47 100644 --- a/packages/agents-desktop/src/preload.ts +++ b/packages/agents-desktop/src/preload.ts @@ -21,6 +21,8 @@ import type { McpServerConfig, OnboardingState, PreventAppSuspensionPreference, + RealtimeSettings, + RealtimeSettingsStatus, ServerConfig, } from './shared/types' import type { CloudAgentServersState } from './cloud/cloud-agent-servers' @@ -190,6 +192,10 @@ const api = { ipcRenderer.invoke(`desktop:get-prevent-app-suspension`), setPreventAppSuspension: (enabled: boolean): Promise => ipcRenderer.invoke(`desktop:set-prevent-app-suspension`, enabled), + getRealtimeSettings: (): Promise => + ipcRenderer.invoke(`desktop:get-realtime-settings`), + setRealtimeSettings: (settings: RealtimeSettings): Promise => + ipcRenderer.invoke(`desktop:set-realtime-settings`, settings), getWorkingDirectory: (): Promise => ipcRenderer.invoke(`desktop:get-working-directory`), chooseWorkingDirectory: (): Promise => diff --git a/packages/agents-desktop/src/settings/realtime.ts b/packages/agents-desktop/src/settings/realtime.ts new file mode 100644 index 0000000000..e9481c6221 --- /dev/null +++ b/packages/agents-desktop/src/settings/realtime.ts @@ -0,0 +1,66 @@ +import type { + ApiKeys, + DesktopSettings, + RealtimeModelChoice, + RealtimeSettings, + RealtimeSettingsStatus, +} from '../shared/types' + +export const DEFAULT_REALTIME_SETTINGS: RealtimeSettings = { + provider: `openai`, + model: `gpt-realtime-2`, +} + +export const OPENAI_REALTIME_MODELS: Array = [ + { + id: `gpt-realtime-2`, + label: `GPT-Realtime-2`, + description: `Strongest realtime reasoning, tool use, and instruction following.`, + recommended: true, + }, + { + id: `gpt-realtime-1.5`, + label: `GPT-Realtime-1.5`, + description: `Fast, reliable speech-to-speech model for audio in, audio out.`, + }, + { + id: `gpt-realtime-mini`, + label: `GPT-Realtime mini`, + description: `Cost-efficient realtime voice model.`, + }, +] + +const OPENAI_REALTIME_MODEL_IDS = new Set( + OPENAI_REALTIME_MODELS.map((model) => model.id) +) + +export function normalizeRealtimeSettings(value: unknown): RealtimeSettings { + if (!value || typeof value !== `object`) return DEFAULT_REALTIME_SETTINGS + const maybe = value as Partial> + const model = + typeof maybe.model === `string` && + OPENAI_REALTIME_MODEL_IDS.has(maybe.model) + ? maybe.model + : DEFAULT_REALTIME_SETTINGS.model + return { + provider: `openai`, + model, + } +} + +export function realtimeSettingsStatus({ + settings, + apiKeys, + launchEnv, +}: { + settings: DesktopSettings + apiKeys: ApiKeys + launchEnv: ApiKeys +}): RealtimeSettingsStatus { + return { + settings: normalizeRealtimeSettings(settings.realtime), + availableModels: OPENAI_REALTIME_MODELS, + hasOpenAIApiKey: Boolean(apiKeys.openai || launchEnv.openai), + codexEnabled: settings.codex?.enabled === true, + } +} diff --git a/packages/agents-desktop/src/settings/store.ts b/packages/agents-desktop/src/settings/store.ts index ece7ca46c2..9465066a61 100644 --- a/packages/agents-desktop/src/settings/store.ts +++ b/packages/agents-desktop/src/settings/store.ts @@ -17,11 +17,15 @@ import { saveApiKeysToSecret, } from '../credentials/api-keys' import { normalizeEnabledModelValues } from '../credentials/model-picker' +import { + DEFAULT_REALTIME_SETTINGS, + normalizeRealtimeSettings, +} from './realtime' import { normalizeServer, normalizeServers } from './servers' export { settingsPath } from '../shared/paths' -export const SETTINGS_VERSION = 2 +export const SETTINGS_VERSION = 3 export const DEFAULT_SETTINGS: DesktopSettings = { servers: [], @@ -31,6 +35,7 @@ export const DEFAULT_SETTINGS: DesktopSettings = { launchAtLogin: false, preventAppSuspension: true, codex: { enabled: false, source: null }, + realtime: DEFAULT_REALTIME_SETTINGS, } export function normalizeCodexSettings(value: unknown): CodexSettings { @@ -165,6 +170,7 @@ export async function loadDesktopSettings( preventAppSuspension: parsed.preventAppSuspension !== false, onboardingDismissed: parsed.onboardingDismissed === true, codex: normalizeCodexSettings(parsed.codex), + realtime: normalizeRealtimeSettings(parsed.realtime), enabledModelValues: enabledModelValues.length > 0 ? enabledModelValues : undefined, mcp: normalizeMcp(parsed.mcp), diff --git a/packages/agents-desktop/src/shared/types.ts b/packages/agents-desktop/src/shared/types.ts index 7f16145739..d5d6ed3945 100644 --- a/packages/agents-desktop/src/shared/types.ts +++ b/packages/agents-desktop/src/shared/types.ts @@ -122,6 +122,27 @@ export type CodexSettings = { source: CodexAuthSource | null } +export type RealtimeProvider = `openai` + +export type RealtimeSettings = { + provider: RealtimeProvider + model: string +} + +export type RealtimeModelChoice = { + id: string + label: string + description: string + recommended?: boolean +} + +export type RealtimeSettingsStatus = { + settings: RealtimeSettings + availableModels: Array + hasOpenAIApiKey: boolean + codexEnabled: boolean +} + export type DesktopSettings = { servers: Array defaultServerId: string | null @@ -131,6 +152,7 @@ export type DesktopSettings = { preventAppSuspension?: boolean codex?: CodexSettings enabledModelValues?: Array + realtime?: RealtimeSettings onboardingDismissed?: boolean mcp?: { servers: Array } seededDefaultMcpServerNames?: Array diff --git a/packages/agents-server-ui/src/components/settings/SettingsSidebar.tsx b/packages/agents-server-ui/src/components/settings/SettingsSidebar.tsx index adbb6d5c95..30019938fa 100644 --- a/packages/agents-server-ui/src/components/settings/SettingsSidebar.tsx +++ b/packages/agents-server-ui/src/components/settings/SettingsSidebar.tsx @@ -6,6 +6,7 @@ import { KeyRound, Palette, Plug, + RadioTower, Server, Settings as SettingsIcon, Terminal, @@ -21,6 +22,7 @@ export type SettingsCategoryId = | `account` | `servers` | `credentials` + | `realtime` | `command-line` | `appearance` | `local-runtime` @@ -105,6 +107,12 @@ export function SettingsSidebar({ icon: , visible: true, }, + { + id: `realtime`, + label: `Realtime`, + icon: , + visible: true, + }, { id: `command-line`, label: `Command Line`, diff --git a/packages/agents-server-ui/src/components/settings/pages/RealtimePage.module.css b/packages/agents-server-ui/src/components/settings/pages/RealtimePage.module.css new file mode 100644 index 0000000000..f7681ab603 --- /dev/null +++ b/packages/agents-server-ui/src/components/settings/pages/RealtimePage.module.css @@ -0,0 +1,61 @@ +.modelSelect { + min-width: 240px; +} + +.modelList { + display: flex; + flex-direction: column; + gap: 0; +} + +.modelItem { + display: flex; + align-items: flex-start; + justify-content: space-between; + gap: 16px; + padding: 12px 0; + border-top: 1px solid var(--ds-border-1); +} + +.modelItem:first-child { + padding-top: 0; + border-top: 0; +} + +.modelItem:last-child { + padding-bottom: 0; +} + +.modelText { + min-width: 0; + display: flex; + flex-direction: column; + gap: 4px; +} + +.modelTitle { + display: inline-flex; + align-items: center; + gap: 6px; + min-width: 0; + color: var(--ds-text-1); + font-size: var(--ds-text-sm); +} + +.modelId { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + color: var(--ds-text-3); + font-size: var(--ds-text-xs); +} + +.modelDescription { + color: var(--ds-text-3); + font-size: var(--ds-text-xs); + line-height: 1.45; +} + +.recommended { + flex-shrink: 0; +} diff --git a/packages/agents-server-ui/src/components/settings/pages/RealtimePage.tsx b/packages/agents-server-ui/src/components/settings/pages/RealtimePage.tsx new file mode 100644 index 0000000000..bbb491c562 --- /dev/null +++ b/packages/agents-server-ui/src/components/settings/pages/RealtimePage.tsx @@ -0,0 +1,215 @@ +import { useEffect, useMemo, useState } from 'react' +import { useNavigate } from '@tanstack/react-router' +import { + loadRealtimeSettingsStatus, + saveRealtimeSettings, + type RealtimeSettingsStatus, +} from '../../../lib/server-connection' +import { Button, Select, Text } from '../../../ui' +import { + SettingsPanel, + SettingsRow, + SettingsScreen, + SettingsSection, + SettingsStatusBadge, +} from '../SettingsScreen' +import styles from './RealtimePage.module.css' + +export function RealtimePage(): React.ReactElement { + const isDesktop = typeof window !== `undefined` && Boolean(window.electronAPI) + const navigate = useNavigate() + const [status, setStatus] = useState(null) + const [saving, setSaving] = useState(false) + const [error, setError] = useState(null) + + useEffect(() => { + let cancelled = false + void loadRealtimeSettingsStatus().then((next) => { + if (cancelled) return + setStatus(next) + }) + return () => { + cancelled = true + } + }, []) + + const modelById = useMemo( + () => new Map(status?.availableModels.map((model) => [model.id, model])), + [status?.availableModels] + ) + const selectedModel = status ? modelById.get(status.settings.model) : null + + const saveModel = async (model: string | null): Promise => { + if (!model || !status) return + const next = { + ...status, + settings: { ...status.settings, model }, + } + setStatus(next) + setSaving(true) + setError(null) + try { + await saveRealtimeSettings(next.settings) + } catch (err) { + setStatus(status) + setError(err instanceof Error ? err.message : String(err)) + } finally { + setSaving(false) + } + } + + return ( + + + {!isDesktop ? ( + + + Realtime settings are managed by the connected desktop or server + runtime. This web build uses the default model when starting a + session from the browser. + + + ) : !status ? ( + + + Loading… + + + ) : ( + <> + + + {status.hasOpenAIApiKey ? `Ready` : `API key required`} + + + + } + /> + OpenAI + } + /> + { + void saveModel(model) + }} + disabled={saving} + > + + model ? (modelById.get(model)?.label ?? model) : `Model` + } + /> + + {status.availableModels.map((model) => ( + + {model.label} + + ))} + + + } + /> + {saving && ( + + + Saving… + + + )} + {error && ( + + + {error} + + + )} + + )} + + + {status && ( + + +
+ {status.availableModels.map((model) => ( +
+
+ + {model.label} + {model.recommended && ( + + Recommended + + )} + + {model.id} + + {model.description} + +
+ {model.id === status.settings.model && ( + + + Selected + + + )} +
+ ))} +
+
+
+ )} +
+ ) +} + +function authDescription(status: RealtimeSettingsStatus): string { + if (status.hasOpenAIApiKey) { + return `Realtime sessions connect to the OpenAI Realtime API with your OpenAI API key.` + } + if (status.codexEnabled) { + return `ChatGPT / Codex sign-in is enabled, but realtime voice still needs an OpenAI API key.` + } + return `Add an OpenAI API key in Credentials. ChatGPT / Codex sign-in alone does not grant Realtime API access.` +} diff --git a/packages/agents-server-ui/src/hooks/useDocumentTitle.ts b/packages/agents-server-ui/src/hooks/useDocumentTitle.ts index 5477f47c64..2cd6747319 100644 --- a/packages/agents-server-ui/src/hooks/useDocumentTitle.ts +++ b/packages/agents-server-ui/src/hooks/useDocumentTitle.ts @@ -10,6 +10,7 @@ const APP_NAME = `Electric Agents` const SETTINGS_CATEGORY_LABELS: Record = { general: `General`, appearance: `Appearance`, + realtime: `Realtime`, 'local-runtime': `Local Runtime`, } diff --git a/packages/agents-server-ui/src/lib/realtime-audio.ts b/packages/agents-server-ui/src/lib/realtime-audio.ts index 155387948a..521aff0473 100644 --- a/packages/agents-server-ui/src/lib/realtime-audio.ts +++ b/packages/agents-server-ui/src/lib/realtime-audio.ts @@ -1,6 +1,7 @@ import { DurableStream } from '@durable-streams/client' import { appendPathToUrl } from '@electric-ax/agents-runtime/client' import { serverFetch, getConfiguredServerHeaders } from './auth-fetch' +import { loadRealtimeSettingsStatus } from './server-connection' export type RealtimeAudioSession = { sessionId: string @@ -327,13 +328,14 @@ async function createRealtimeSession( baseUrl: string, entityUrl: string ): Promise { + const realtimeSettings = await loadRealtimeSettingsStatus() const response = await serverFetch(realtimeUrl(baseUrl), { method: `POST`, headers: { 'content-type': `application/json` }, body: JSON.stringify({ entityUrl, provider: `openai`, - model: `gpt-realtime-2`, + model: realtimeSettings.settings.model, inputAudio: { codec: `pcm16`, sampleRate: REALTIME_SAMPLE_RATE, diff --git a/packages/agents-server-ui/src/lib/server-connection.ts b/packages/agents-server-ui/src/lib/server-connection.ts index 865fab9b57..8b3f53af80 100644 --- a/packages/agents-server-ui/src/lib/server-connection.ts +++ b/packages/agents-server-ui/src/lib/server-connection.ts @@ -173,6 +173,49 @@ export interface ApiKeysStatus { modelPicker: ModelPickerStatus } +export type RealtimeSettings = { + provider: `openai` + model: string +} + +export type RealtimeModelChoice = { + id: string + label: string + description: string + recommended?: boolean +} + +export type RealtimeSettingsStatus = { + settings: RealtimeSettings + availableModels: Array + hasOpenAIApiKey: boolean + codexEnabled: boolean +} + +const DEFAULT_REALTIME_SETTINGS_STATUS: RealtimeSettingsStatus = { + settings: { provider: `openai`, model: `gpt-realtime-2` }, + availableModels: [ + { + id: `gpt-realtime-2`, + label: `GPT-Realtime-2`, + description: `Strongest realtime reasoning, tool use, and instruction following.`, + recommended: true, + }, + { + id: `gpt-realtime-1.5`, + label: `GPT-Realtime-1.5`, + description: `Fast, reliable speech-to-speech model for audio in, audio out.`, + }, + { + id: `gpt-realtime-mini`, + label: `GPT-Realtime mini`, + description: `Cost-efficient realtime voice model.`, + }, + ], + hasOpenAIApiKey: false, + codexEnabled: false, +} + /** * Snapshot consumed by the renderer's onboarding wizard. * @@ -376,6 +419,8 @@ declare global { setOnboardingDismissed?: (dismissed: boolean) => Promise getPreventAppSuspension?: () => Promise setPreventAppSuspension?: (enabled: boolean) => Promise + getRealtimeSettings?: () => Promise + setRealtimeSettings?: (settings: RealtimeSettings) => Promise getWorkingDirectory?: () => Promise chooseWorkingDirectory?: () => Promise /** @@ -591,6 +636,19 @@ export async function saveEnabledModels(values: Array): Promise { await window.electronAPI?.saveEnabledModels?.(values) } +export async function loadRealtimeSettingsStatus(): Promise { + return ( + (await window.electronAPI?.getRealtimeSettings?.()) ?? + DEFAULT_REALTIME_SETTINGS_STATUS + ) +} + +export async function saveRealtimeSettings( + settings: RealtimeSettings +): Promise { + await window.electronAPI?.setRealtimeSettings?.(settings) +} + export async function codexSignIn(): Promise { return (await window.electronAPI?.codexSignIn?.()) ?? null } diff --git a/packages/agents-server-ui/src/router.tsx b/packages/agents-server-ui/src/router.tsx index 64956671d5..449bf860cc 100644 --- a/packages/agents-server-ui/src/router.tsx +++ b/packages/agents-server-ui/src/router.tsx @@ -53,6 +53,7 @@ import { GeneralPage } from './components/settings/pages/GeneralPage' import { AccountPage } from './components/settings/pages/AccountPage' import { AppearancePage } from './components/settings/pages/AppearancePage' import { CredentialsPage } from './components/settings/pages/CredentialsPage' +import { RealtimePage } from './components/settings/pages/RealtimePage' import { CommandLinePage } from './components/settings/pages/CommandLinePage' import { ServersPage } from './components/settings/pages/ServersPage' import { McpServersPage } from './components/settings/pages/McpServersPage' @@ -64,6 +65,7 @@ const SETTINGS_CATEGORY_IDS: ReadonlyArray = [ `account`, `servers`, `credentials`, + `realtime`, `command-line`, `appearance`, `local-runtime`, @@ -564,6 +566,8 @@ function SettingsCategoryPage(): React.ReactElement { return case `credentials`: return + case `realtime`: + return case `command-line`: return case `local-runtime`: From f322210d5eaa089478522ca9e1b084e23d90fb76 Mon Sep 17 00:00:00 2001 From: Sam Willis Date: Wed, 10 Jun 2026 10:42:45 +0100 Subject: [PATCH 28/31] fix(agents): gate realtime controls on credentials --- .../src/components/MessageInput.tsx | 53 +++++++++--- .../src/components/views/NewSessionView.tsx | 15 +++- .../src/hooks/useRealtimeAvailability.ts | 82 +++++++++++++++++++ 3 files changed, 138 insertions(+), 12 deletions(-) create mode 100644 packages/agents-server-ui/src/hooks/useRealtimeAvailability.ts diff --git a/packages/agents-server-ui/src/components/MessageInput.tsx b/packages/agents-server-ui/src/components/MessageInput.tsx index b0cee09536..c3a3f4d29b 100644 --- a/packages/agents-server-ui/src/components/MessageInput.tsx +++ b/packages/agents-server-ui/src/components/MessageInput.tsx @@ -14,6 +14,7 @@ import { startRealtimeAudioSession, type RealtimeAudioSession, } from '../lib/realtime-audio' +import { useRealtimeAvailability } from '../hooks/useRealtimeAvailability' import { ComposerEditor } from './ComposerEditor' import { ComposerShell } from './ComposerShell' import { Icon, Stack, Text, Tooltip } from '../ui' @@ -99,6 +100,7 @@ export function MessageInput({ const realtimeSessionRef = useRef(null) const handledAutoStartRealtimeRef = useRef(null) const composerFocusRef = useRef<{ focus: () => void } | null>(null) + const realtimeAvailability = useRealtimeAvailability() const inputDisabled = disabled || writeDisabled const attachmentsDisabled = inputDisabled || Boolean(editingMessage) || !imageAttachmentsEnabled @@ -178,7 +180,11 @@ export function MessageInput({ attachmentCount === 0 && !disabled const canStop = showStop && !stopPending && !stopDisabled - const canUseRealtime = !inputDisabled && !editingMessage && Boolean(baseUrl) + const canStartRealtime = + !inputDisabled && + !editingMessage && + Boolean(baseUrl) && + realtimeAvailability.canStart useEffect(() => { return () => { @@ -270,7 +276,12 @@ export function MessageInput({ }) return } - if (!canUseRealtime) return + if (!canStartRealtime) { + if (realtimeAvailability.unavailableReason) { + setError(realtimeAvailability.unavailableReason) + } + return + } setRealtimePending(true) startRealtimeAudioSession({ baseUrl, @@ -288,12 +299,27 @@ export function MessageInput({ .finally(() => { setRealtimePending(false) }) - }, [baseUrl, canUseRealtime, entityUrl, realtimePending]) + }, [ + baseUrl, + canStartRealtime, + entityUrl, + realtimeAvailability.unavailableReason, + realtimePending, + ]) useEffect(() => { if (!autoStartRealtimeSignal) return if (handledAutoStartRealtimeRef.current === autoStartRealtimeSignal) return - if (!canUseRealtime || realtimePending) return + if (realtimeAvailability.loading || realtimePending) return + if (!realtimeAvailability.canStart) { + handledAutoStartRealtimeRef.current = autoStartRealtimeSignal + onRealtimeAutoStartConsumed?.() + if (realtimeAvailability.unavailableReason) { + setError(realtimeAvailability.unavailableReason) + } + return + } + if (!canStartRealtime) return handledAutoStartRealtimeRef.current = autoStartRealtimeSignal onRealtimeAutoStartConsumed?.() if (!realtimeSessionRef.current) { @@ -301,9 +327,12 @@ export function MessageInput({ } }, [ autoStartRealtimeSignal, - canUseRealtime, + canStartRealtime, handleRealtimeToggle, onRealtimeAutoStartConsumed, + realtimeAvailability.canStart, + realtimeAvailability.loading, + realtimeAvailability.unavailableReason, realtimePending, ]) @@ -393,6 +422,13 @@ export function MessageInput({ ? `Signal permission required` : `Stop generating` : `Send message` + const realtimeTooltip = realtimeActive + ? `Stop voice mode` + : realtimeAvailability.loading + ? `Checking realtime credentials` + : (realtimeAvailability.unavailableReason ?? `Start voice mode`) + const realtimeButtonDisabled = + realtimePending || (!realtimeActive && !canStartRealtime) return ( {drawer?.({ @@ -442,10 +478,7 @@ export function MessageInput({ } controls={ <> - +