diff --git a/.changeset/realtime-agents-voice-mode.md b/.changeset/realtime-agents-voice-mode.md new file mode 100644 index 0000000000..efe20c4899 --- /dev/null +++ b/.changeset/realtime-agents-voice-mode.md @@ -0,0 +1,9 @@ +--- +'@electric-ax/agents': patch +'@electric-ax/agents-desktop': patch +'@electric-ax/agents-runtime': patch +'@electric-ax/agents-server': patch +'@electric-ax/agents-server-ui': patch +--- + +Add OpenAI realtime voice mode for Electric Agents, backed by durable audio/control streams. Horton can enter realtime mode with normal context and tools, desktop exposes realtime model/voice/reasoning settings, the server/runtime persist session stream refs, transcripts, and audio spans, and the UI adds voice controls, typed-message forwarding, credential gating, input metering, new-session voice startup, and audio capture/playback fixes. diff --git a/packages/agents-desktop/package.json b/packages/agents-desktop/package.json index 63229257b8..0c2450adb1 100644 --- a/packages/agents-desktop/package.json +++ b/packages/agents-desktop/package.json @@ -27,6 +27,7 @@ "typecheck": "tsc --noEmit" }, "dependencies": { + "@electric-ax/agents-runtime": "workspace:*", "@electric-sql/client": "^1.5.20", "@mixmark-io/domino": "^2.2.0", "better-sqlite3": "^12.9.0", diff --git a/packages/agents-desktop/src/app/controller.ts b/packages/agents-desktop/src/app/controller.ts index 1560564d4b..0eb1f96f7b 100644 --- a/packages/agents-desktop/src/app/controller.ts +++ b/packages/agents-desktop/src/app/controller.ts @@ -11,6 +11,7 @@ import * as DesktopIpc from '../ipc/register' import { ensureRuntimeEntry as ensureRuntimeEntryInStore } from '../runtime/entries' import { createRuntimeController } from '../runtime/controller' import * as SettingsBootstrap from '../settings/bootstrap' +import * as RealtimeSettings from '../settings/realtime' import * as ServerSelection from '../settings/selection' import { saveDesktopSettings } from '../settings/store' import { 
desktopStateForWindow as desktopStateForWindowImpl } from '../state/desktop-state' @@ -30,6 +31,7 @@ import type { DesktopMenuSection, DesktopMenuState, DesktopState, + RealtimeSettings as RealtimeSettingsConfig, RuntimeEntry, ServerConfig, } from '../shared/types' @@ -328,6 +330,20 @@ export function createDesktopMainController(ctx: DesktopAppContext) { runtime.refreshPowerSaveBlocker() } + const getRealtimeSettingsStatus = async () => + await RealtimeSettings.realtimeSettingsStatus({ + settings, + apiKeys, + launchEnv: ctx.envApiKeysSnapshot, + }) + + const setRealtimeSettings = async ( + next: RealtimeSettingsConfig + ): Promise => { + settings.realtime = RealtimeSettings.normalizeRealtimeSettings(next) + await saveSettings() + } + const syncLaunchAtLoginSetting = async (): Promise => { await LoginItems.setLaunchAtLogin(settings.launchAtLogin === true) } @@ -438,6 +454,8 @@ export function createDesktopMainController(ctx: DesktopAppContext) { setLaunchAtLogin, getPreventAppSuspension, setPreventAppSuspension, + getRealtimeSettingsStatus, + setRealtimeSettings, } const loadSettings = (): Promise => diff --git a/packages/agents-desktop/src/ipc/preferences.ts b/packages/agents-desktop/src/ipc/preferences.ts index cfd50bab3f..abb98d51cc 100644 --- a/packages/agents-desktop/src/ipc/preferences.ts +++ b/packages/agents-desktop/src/ipc/preferences.ts @@ -2,6 +2,8 @@ import { ipcMain } from 'electron' import type { LaunchAtLoginStatus, PreventAppSuspensionPreference, + RealtimeSettings, + RealtimeSettingsStatus, } from '../shared/types' export type PreferencesIpcDeps = { @@ -9,6 +11,10 @@ export type PreferencesIpcDeps = { setLaunchAtLogin: (enabled: boolean) => Promise getPreventAppSuspension: () => PreventAppSuspensionPreference setPreventAppSuspension: (enabled: boolean) => Promise + getRealtimeSettingsStatus: () => + | RealtimeSettingsStatus + | Promise + setRealtimeSettings: (settings: RealtimeSettings) => Promise } export function 
registerPreferencesIpcHandlers(deps: PreferencesIpcDeps): void { @@ -25,4 +31,11 @@ export function registerPreferencesIpcHandlers(deps: PreferencesIpcDeps): void { `desktop:set-prevent-app-suspension`, (_event, enabled: boolean) => deps.setPreventAppSuspension(Boolean(enabled)) ) + ipcMain.handle(`desktop:get-realtime-settings`, () => + deps.getRealtimeSettingsStatus() + ) + ipcMain.handle( + `desktop:set-realtime-settings`, + (_event, settings: RealtimeSettings) => deps.setRealtimeSettings(settings) + ) } diff --git a/packages/agents-desktop/src/preload.ts b/packages/agents-desktop/src/preload.ts index 82c437a935..af780edb47 100644 --- a/packages/agents-desktop/src/preload.ts +++ b/packages/agents-desktop/src/preload.ts @@ -21,6 +21,8 @@ import type { McpServerConfig, OnboardingState, PreventAppSuspensionPreference, + RealtimeSettings, + RealtimeSettingsStatus, ServerConfig, } from './shared/types' import type { CloudAgentServersState } from './cloud/cloud-agent-servers' @@ -190,6 +192,10 @@ const api = { ipcRenderer.invoke(`desktop:get-prevent-app-suspension`), setPreventAppSuspension: (enabled: boolean): Promise => ipcRenderer.invoke(`desktop:set-prevent-app-suspension`, enabled), + getRealtimeSettings: (): Promise => + ipcRenderer.invoke(`desktop:get-realtime-settings`), + setRealtimeSettings: (settings: RealtimeSettings): Promise => + ipcRenderer.invoke(`desktop:set-realtime-settings`, settings), getWorkingDirectory: (): Promise => ipcRenderer.invoke(`desktop:get-working-directory`), chooseWorkingDirectory: (): Promise => diff --git a/packages/agents-desktop/src/settings/realtime.ts b/packages/agents-desktop/src/settings/realtime.ts new file mode 100644 index 0000000000..23456255b4 --- /dev/null +++ b/packages/agents-desktop/src/settings/realtime.ts @@ -0,0 +1,145 @@ +import { createHash } from 'node:crypto' +import type { + ApiKeys, + DesktopSettings, + RealtimeCredentialStatus, + RealtimeSettings, + RealtimeSettingsStatus, +} from '../shared/types' +import 
{ + DEFAULT_OPENAI_REALTIME_MODEL, + DEFAULT_OPENAI_REALTIME_REASONING_EFFORT, + DEFAULT_OPENAI_REALTIME_VOICE, + OPENAI_REALTIME_MODELS, + OPENAI_REALTIME_REASONING_EFFORTS, + OPENAI_REALTIME_VOICES, + isOpenAIRealtimeModel, + isOpenAIRealtimeReasoningEffort, + isOpenAIRealtimeVoice, +} from '@electric-ax/agents-runtime' + +export const DEFAULT_REALTIME_SETTINGS: RealtimeSettings = { + provider: `openai`, + model: DEFAULT_OPENAI_REALTIME_MODEL, + voice: DEFAULT_OPENAI_REALTIME_VOICE, + reasoningEffort: DEFAULT_OPENAI_REALTIME_REASONING_EFFORT, + interruptResponse: true, +} + +const OPENAI_REALTIME_VALIDATION_TTL_MS = 5 * 60 * 1000 + +type RealtimeCredentialValidation = { + openAIApiKeyStatus: RealtimeCredentialStatus + openAIApiKeyError?: string +} + +const validationCache = new Map< + string, + { expiresAt: number; result: RealtimeCredentialValidation } +>() + +export function normalizeRealtimeSettings(value: unknown): RealtimeSettings { + if (!value || typeof value !== `object`) return DEFAULT_REALTIME_SETTINGS + const maybe = value as Partial> + return { + provider: `openai`, + model: isOpenAIRealtimeModel(maybe.model) + ? maybe.model + : DEFAULT_REALTIME_SETTINGS.model, + voice: isOpenAIRealtimeVoice(maybe.voice) + ? maybe.voice + : DEFAULT_REALTIME_SETTINGS.voice, + reasoningEffort: isOpenAIRealtimeReasoningEffort(maybe.reasoningEffort) + ? maybe.reasoningEffort + : DEFAULT_REALTIME_SETTINGS.reasoningEffort, + interruptResponse: + typeof maybe.interruptResponse === `boolean` + ? 
maybe.interruptResponse + : DEFAULT_REALTIME_SETTINGS.interruptResponse, + } +} + +function validationCacheKey(apiKey: string, model: string): string { + const keyHash = createHash(`sha256`).update(apiKey).digest(`hex`) + return `${keyHash}:${model}` +} + +async function validateOpenAIRealtimeApiKey( + apiKey: string | null | undefined, + model: string +): Promise { + if (!apiKey) { + return { openAIApiKeyStatus: `missing` } + } + + const cacheKey = validationCacheKey(apiKey, model) + const cached = validationCache.get(cacheKey) + if (cached && cached.expiresAt > Date.now()) return cached.result + + let result: RealtimeCredentialValidation + try { + const response = await fetch( + `https://api.openai.com/v1/models/${encodeURIComponent(model)}`, + { + headers: { Authorization: `Bearer ${apiKey}` }, + } + ) + if (response.ok) { + result = { openAIApiKeyStatus: `valid` } + } else if ( + response.status === 401 || + response.status === 403 || + response.status === 404 + ) { + result = { + openAIApiKeyStatus: `invalid`, + openAIApiKeyError: + response.status === 404 + ? `OpenAI API key cannot access ${model}.` + : `OpenAI API key was rejected (${response.status}).`, + } + } else { + result = { + openAIApiKeyStatus: `unknown`, + openAIApiKeyError: `OpenAI credential check failed (${response.status}).`, + } + } + } catch (error) { + result = { + openAIApiKeyStatus: `unknown`, + openAIApiKeyError: error instanceof Error ? 
error.message : String(error), + } + } + + validationCache.set(cacheKey, { + expiresAt: Date.now() + OPENAI_REALTIME_VALIDATION_TTL_MS, + result, + }) + return result +} + +export async function realtimeSettingsStatus({ + settings, + apiKeys, + launchEnv, +}: { + settings: DesktopSettings + apiKeys: ApiKeys + launchEnv: ApiKeys +}): Promise { + const normalized = normalizeRealtimeSettings(settings.realtime) + const apiKey = apiKeys.openai || launchEnv.openai + const validation = await validateOpenAIRealtimeApiKey( + apiKey, + normalized.model + ) + return { + settings: normalized, + availableModels: [...OPENAI_REALTIME_MODELS], + availableVoices: [...OPENAI_REALTIME_VOICES], + availableReasoningEfforts: [...OPENAI_REALTIME_REASONING_EFFORTS], + hasOpenAIApiKey: Boolean(apiKey), + ...validation, + codexEnabled: settings.codex?.enabled === true, + } +} diff --git a/packages/agents-desktop/src/settings/store.ts b/packages/agents-desktop/src/settings/store.ts index ece7ca46c2..9465066a61 100644 --- a/packages/agents-desktop/src/settings/store.ts +++ b/packages/agents-desktop/src/settings/store.ts @@ -17,11 +17,15 @@ import { saveApiKeysToSecret, } from '../credentials/api-keys' import { normalizeEnabledModelValues } from '../credentials/model-picker' +import { + DEFAULT_REALTIME_SETTINGS, + normalizeRealtimeSettings, +} from './realtime' import { normalizeServer, normalizeServers } from './servers' export { settingsPath } from '../shared/paths' -export const SETTINGS_VERSION = 2 +export const SETTINGS_VERSION = 3 export const DEFAULT_SETTINGS: DesktopSettings = { servers: [], @@ -31,6 +35,7 @@ export const DEFAULT_SETTINGS: DesktopSettings = { launchAtLogin: false, preventAppSuspension: true, codex: { enabled: false, source: null }, + realtime: DEFAULT_REALTIME_SETTINGS, } export function normalizeCodexSettings(value: unknown): CodexSettings { @@ -165,6 +170,7 @@ export async function loadDesktopSettings( preventAppSuspension: parsed.preventAppSuspension !== false, 
onboardingDismissed: parsed.onboardingDismissed === true, codex: normalizeCodexSettings(parsed.codex), + realtime: normalizeRealtimeSettings(parsed.realtime), enabledModelValues: enabledModelValues.length > 0 ? enabledModelValues : undefined, mcp: normalizeMcp(parsed.mcp), diff --git a/packages/agents-desktop/src/shared/types.ts b/packages/agents-desktop/src/shared/types.ts index 7f16145739..312a414014 100644 --- a/packages/agents-desktop/src/shared/types.ts +++ b/packages/agents-desktop/src/shared/types.ts @@ -4,6 +4,12 @@ import type { McpServerConfig, RegistrySnapshot, } from '@electric-ax/agents' +import type { + OpenAIRealtimeReasoningEffort, + RealtimeModelChoice, + RealtimeReasoningEffortChoice, + RealtimeVoiceChoice, +} from '@electric-ax/agents-runtime' export type ServerSource = `manual` | `local-discovery` | `electric-cloud` export type ServerDesiredState = `connected` | `disconnected` @@ -122,6 +128,33 @@ export type CodexSettings = { source: CodexAuthSource | null } +export type RealtimeProvider = `openai` + +export type RealtimeSettings = { + provider: RealtimeProvider + model: string + voice: string + reasoningEffort: OpenAIRealtimeReasoningEffort + interruptResponse: boolean +} + +export type RealtimeCredentialStatus = + | `missing` + | `valid` + | `invalid` + | `unknown` + +export type RealtimeSettingsStatus = { + settings: RealtimeSettings + availableModels: Array + availableVoices: Array + availableReasoningEfforts: Array + hasOpenAIApiKey: boolean + openAIApiKeyStatus: RealtimeCredentialStatus + openAIApiKeyError?: string + codexEnabled: boolean +} + export type DesktopSettings = { servers: Array defaultServerId: string | null @@ -131,6 +164,7 @@ export type DesktopSettings = { preventAppSuspension?: boolean codex?: CodexSettings enabledModelValues?: Array + realtime?: RealtimeSettings onboardingDismissed?: boolean mcp?: { servers: Array } seededDefaultMcpServerNames?: Array diff --git a/packages/agents-runtime/src/agents-client.ts 
b/packages/agents-runtime/src/agents-client.ts index dd3358abca..6e86b6f9a5 100644 --- a/packages/agents-runtime/src/agents-client.ts +++ b/packages/agents-runtime/src/agents-client.ts @@ -4,6 +4,10 @@ import { normalizeObservationSchema } from './observation-schema' import { createRuntimeServerClient } from './runtime-server-client' import { appendPathToUrl } from './url' import type { EntitySignal } from './runtime-server-client' +import type { + RealtimeSessionStartResult, + StartRealtimeSessionOptions, +} from './runtime-server-client' import type { EntitiesObservationSource, EntityObservationSource, @@ -31,6 +35,9 @@ export interface AgentsClient { payload?: unknown }) => Promise<{ txid: number }> kill: (entityUrl: string, reason?: string) => Promise<{ txid: number }> + startRealtimeSession: ( + options: StartRealtimeSessionOptions + ) => Promise } export function createAgentsClient(config: AgentsClientConfig): AgentsClient { @@ -44,6 +51,8 @@ export function createAgentsClient(config: AgentsClientConfig): AgentsClient { signal: `SIGKILL`, reason, }), + startRealtimeSession: (options) => + serverClient.startRealtimeSession(options), async observe(source) { if (source.sourceType === `entity`) { const info = await serverClient.getEntity( diff --git a/packages/agents-runtime/src/client.ts b/packages/agents-runtime/src/client.ts index c222461724..feb1197bbc 100644 --- a/packages/agents-runtime/src/client.ts +++ b/packages/agents-runtime/src/client.ts @@ -11,6 +11,24 @@ export { normalizeEntityTimelineData, normalizeTimelineEntities, } from './entity-timeline' +export { + DEFAULT_OPENAI_REALTIME_MODEL, + DEFAULT_OPENAI_REALTIME_REASONING_EFFORT, + DEFAULT_OPENAI_REALTIME_VOICE, + OPENAI_REALTIME_MODELS, + OPENAI_REALTIME_REASONING_EFFORTS, + OPENAI_REALTIME_VOICES, + isOpenAIRealtimeModel, + isOpenAIRealtimeReasoningEffort, + isOpenAIRealtimeVoice, +} from './realtime-options' +export type { + OpenAIRealtimeReasoningEffort, + RealtimeModelChoice, + 
RealtimeProviderId, + RealtimeReasoningEffortChoice, + RealtimeVoiceChoice, +} from './realtime-options' export { db, entities, @@ -48,6 +66,11 @@ export type { SlashCommandTrigger, } from './composer-input' export type { AgentsClient, AgentsClientConfig } from './agents-client' +export type { + RealtimeAudioOptions, + RealtimeSessionStartResult, + StartRealtimeSessionOptions, +} from './runtime-server-client' export type { AttachmentRole, AttachmentStatus, diff --git a/packages/agents-runtime/src/context-factory.ts b/packages/agents-runtime/src/context-factory.ts index 006b5c7241..8c1669a7e5 100644 --- a/packages/agents-runtime/src/context-factory.ts +++ b/packages/agents-runtime/src/context-factory.ts @@ -1,4 +1,5 @@ import { queryOnce } from '@durable-streams/state/db' +import { DurableStream } from '@durable-streams/client' import { assembleContext } from './context-assembly' import { createContextEntriesApi } from './context-entries' import { entityStateSchema } from './entity-schema' @@ -6,6 +7,7 @@ import { formatPointerOrderToken } from './event-pointer' import { createOutboundBridge, loadOutboundIdSeed } from './outbound-bridge' import { createPiAgentAdapter } from './pi-adapter' import { + defaultProjection, timelineMessages as runtimeTimelineMessages, timelineToMessages, } from './timeline-context' @@ -13,6 +15,7 @@ import { getCronStreamPath } from './cron-utils' import { runtimeLog } from './log' import { sliceChars } from './token-budget' import { createContextTools } from './tools/context-tools' +import { appendPathToUrl } from './url' import { CACHE_TIERS } from './types' import { composeToolsWithProviders } from './tool-providers' import { validateSlashCommandDefinitions } from './composer-input' @@ -42,8 +45,16 @@ import type { HandlerWake, LLMMessage, ManifestAttachmentEntry, + ManifestRealtimeSessionEntry, ObservationHandle, ObservationSource, + RealtimeAudioConfig, + RealtimeAudioFormat, + RealtimeConfig, + RealtimeHandle, + 
RealtimeProviderEvent, + RealtimeProviderSession, + RealtimeRunResult, RunHandle, SendResult, SharedStateHandle, @@ -56,6 +67,16 @@ import type { WakeSession, } from './types' +const REALTIME_MIN_INPUT_COMMIT_BYTES = 4_800 +const REALTIME_SESSION_SOFT_LIMIT_MS = 55 * 60 * 1000 +const REALTIME_AUDIO_SPAN_MAX_MS = 500 +const REALTIME_PCM16_BYTES_PER_SAMPLE = 2 +const REALTIME_DEFAULT_AUDIO_FORMAT: RealtimeAudioFormat = { + codec: `pcm16`, + sampleRate: 24_000, + channels: 1, +} + function agentModelId(model: AgentModel): string { return typeof model === `string` ? model : model.id } @@ -66,6 +87,582 @@ function agentModelProvider(config: AgentConfig): string { : config.model.provider } +function isRealtimeSessionManifest( + entry: unknown +): entry is ManifestRealtimeSessionEntry { + return ( + typeof entry === `object` && + entry !== null && + (entry as { kind?: unknown }).kind === `realtime-session` && + typeof (entry as { id?: unknown }).id === `string` + ) +} + +function realtimeManifestIsActive( + entry: ManifestRealtimeSessionEntry +): boolean { + return entry.status === `requested` || entry.status === `active` +} + +function getToolName(tool: AgentTool): string | null { + const name = (tool as { name?: unknown }).name + return typeof name === `string` ? name : null +} + +function applyRealtimeToolPolicy( + tools: Array, + policy: RealtimeConfig[`toolPolicy`] +): Array { + if (!policy) return tools + const allowed = new Set([...(policy.direct ?? []), ...(policy.confirm ?? 
[])]) + if (allowed.size === 0) return [] + return tools.filter((tool) => { + const name = getToolName(tool) + return name != null && allowed.has(name) + }) +} + +type RealtimeStreamConfig = NonNullable +type RealtimeControlInput = + | { type: `input_text`; text: string } + | { type: `input_audio.commit`; afterAudioBytes?: number } + | { type: `response.cancel` } + | { type: `output_audio.truncate`; itemId: string; audioEndMs: number } + | { type: `session.close`; reason?: string } +type RealtimeStreamIo = { + writeProviderEvent: (event: RealtimeProviderEvent) => Promise + close: () => Promise +} +type RealtimeAudioSpanDraft = { + stream: `input` | `output` + seq: number + producerId: string + producerEpoch: number + byteStart: number + byteEnd: number + sampleStart: number + sampleCount: number + sampleRate: number + channels: number + timingSource: `runtime` | `provider` + createdAt: string + capturedAt?: string + receivedAt?: string + participantId?: string + providerItemId?: string + responseId?: string +} + +function trackRealtimeAppend( + pending: Set>, + append: Promise, + onError: (error: unknown) => void +): void { + let tracked: Promise + tracked = append.catch(onError).finally(() => { + pending.delete(tracked) + }) + pending.add(tracked) +} + +function isRealtimeControlInput(value: unknown): value is RealtimeControlInput { + if (!value || typeof value !== `object`) return false + const type = (value as { type?: unknown }).type + if (type === `output_audio.truncate`) { + return ( + typeof (value as { itemId?: unknown }).itemId === `string` && + typeof (value as { audioEndMs?: unknown }).audioEndMs === `number` + ) + } + if (type === `input_audio.commit`) { + const afterAudioBytes = (value as { afterAudioBytes?: unknown }) + .afterAudioBytes + return ( + afterAudioBytes === undefined || + (typeof afterAudioBytes === `number` && + Number.isFinite(afterAudioBytes) && + afterAudioBytes >= 0) + ) + } + if (type === `input_text`) { + return typeof (value as { 
text?: unknown }).text === `string` + } + return type === `response.cancel` || type === `session.close` +} + +function realtimeDurableStream( + streams: RealtimeStreamConfig, + path: string, + contentType: string +): DurableStream { + return new DurableStream({ + url: appendPathToUrl(streams.baseUrl, path), + headers: streams.headers, + contentType, + batching: true, + }) +} + +function jsonBytes(value: unknown): Uint8Array { + return new TextEncoder().encode(JSON.stringify(value)) +} + +function realtimeControlOutput(event: RealtimeProviderEvent): unknown { + if (event.type !== `output_audio.delta`) return event + return { + type: event.type, + responseId: event.responseId, + itemId: event.itemId, + byteLength: event.audio.byteLength, + } +} + +function useManualRealtimeInputCommits( + audio: RealtimeAudioConfig | undefined +): boolean { + return audio?.turnDetection === false || audio?.turnDetection?.type === `none` +} + +function realtimeByteOffset(byte: number): string { + return `byte:${byte}` +} + +function realtimeAudioFrameBytes(format: RealtimeAudioFormat): number { + return REALTIME_PCM16_BYTES_PER_SAMPLE * format.channels +} + +function realtimeAudioSamples( + byteLength: number, + format: RealtimeAudioFormat +): number { + return Math.floor(byteLength / realtimeAudioFrameBytes(format)) +} + +function createRealtimeStreamIo( + config: HandlerContextConfig, + session: ManifestRealtimeSessionEntry | undefined, + providerSession: RealtimeProviderSession, + audio: RealtimeAudioConfig | undefined +): RealtimeStreamIo | undefined { + if (!config.realtimeStreams || !session) return undefined + + const logPrefix = `[agent-runtime]` + const abort = new AbortController() + const abortFromRun = (): void => abort.abort() + if (config.runSignal?.aborted) { + abort.abort() + } else { + config.runSignal?.addEventListener(`abort`, abortFromRun, { once: true }) + } + + const audioIn = realtimeDurableStream( + config.realtimeStreams, + session.streams.audio_in, + 
`audio/pcm` + ) + const audioOut = realtimeDurableStream( + config.realtimeStreams, + session.streams.audio_out, + `audio/pcm` + ) + const controlIn = realtimeDurableStream( + config.realtimeStreams, + session.streams.control_in, + `application/json` + ) + const controlOut = realtimeDurableStream( + config.realtimeStreams, + session.streams.control_out, + `application/json` + ) + const tasks: Array> = [] + let audioInChunks = 0 + let audioInBytes = 0 + let committedAudioInBytes = 0 + let controlInCommands = 0 + let audioOutChunks = 0 + let audioOutBytes = 0 + let controlOutEvents = 0 + const pendingOutputAppends = new Set>() + const pendingInputCommits: Array<{ afterAudioBytes?: number }> = [] + const pendingAudioChunks: Array<{ + start: number + end: number + data: Uint8Array + }> = [] + const inputAudioFormat = audio?.inputFormat ?? REALTIME_DEFAULT_AUDIO_FORMAT + const outputAudioFormat = audio?.outputFormat ?? REALTIME_DEFAULT_AUDIO_FORMAT + const audioSpanDrafts: Partial< + Record<`input` | `output`, RealtimeAudioSpanDraft> + > = {} + let inputAudioSpanSeq = 0 + let outputAudioSpanSeq = 0 + let processingInputCommits = false + const manualInputCommits = useManualRealtimeInputCommits(audio) + + const trackOutputAppend = (append: Promise, label: string): void => { + trackRealtimeAppend(pendingOutputAppends, append, (error) => { + if (!abort.signal.aborted) { + runtimeLog.warn(logPrefix, `${label}:`, error) + } + }) + } + + const flushAudioSpan = (stream: `input` | `output`): void => { + const draft = audioSpanDrafts[stream] + if (!draft || draft.byteEnd <= draft.byteStart) return + audioSpanDrafts[stream] = undefined + config.writeEvent( + entityStateSchema.realtimeAudioSpans.insert({ + key: `realtime-audio-span:${session.id}:${stream}:${draft.seq}`, + value: { + session_id: session.id, + stream, + producer_id: draft.producerId, + producer_epoch: draft.producerEpoch, + seq: draft.seq, + offset: realtimeByteOffset(draft.byteStart), + next_offset: 
realtimeByteOffset(draft.byteEnd), + byte_start: draft.byteStart, + byte_end: draft.byteEnd, + byte_length: draft.byteEnd - draft.byteStart, + sample_start: draft.sampleStart, + sample_count: draft.sampleCount, + sample_rate: draft.sampleRate, + channels: draft.channels, + codec: `pcm16`, + timing_source: draft.timingSource, + created_at: draft.createdAt, + ...(draft.capturedAt ? { captured_at: draft.capturedAt } : {}), + ...(draft.receivedAt ? { received_at: draft.receivedAt } : {}), + ...(draft.participantId + ? { participant_id: draft.participantId } + : {}), + ...(draft.providerItemId + ? { provider_item_id: draft.providerItemId } + : {}), + ...(draft.responseId ? { response_id: draft.responseId } : {}), + } as never, + }) as ChangeEvent + ) + } + + const appendAudioSpan = (input: { + stream: `input` | `output` + byteStart: number + byteLength: number + format: RealtimeAudioFormat + producerId: string + timingSource: `runtime` | `provider` + capturedAt?: string + receivedAt?: string + participantId?: string + providerItemId?: string + responseId?: string + }): void => { + if (input.byteLength <= 0) return + const frameBytes = realtimeAudioFrameBytes(input.format) + const byteEnd = input.byteStart + input.byteLength + const sampleStart = Math.floor(input.byteStart / frameBytes) + const sampleCount = realtimeAudioSamples(input.byteLength, input.format) + const maxSampleCount = Math.max( + 1, + Math.floor((input.format.sampleRate * REALTIME_AUDIO_SPAN_MAX_MS) / 1000) + ) + const draft = audioSpanDrafts[input.stream] + const compatible = + draft && + draft.producerId === input.producerId && + draft.timingSource === input.timingSource && + draft.participantId === input.participantId && + draft.providerItemId === input.providerItemId && + draft.responseId === input.responseId && + draft.byteEnd === input.byteStart && + draft.sampleRate === input.format.sampleRate && + draft.channels === input.format.channels && + draft.sampleCount + sampleCount <= maxSampleCount + + 
if (compatible) { + draft.byteEnd = byteEnd + draft.sampleCount += sampleCount + draft.receivedAt = input.receivedAt ?? draft.receivedAt + return + } + + flushAudioSpan(input.stream) + const seq = + input.stream === `input` ? inputAudioSpanSeq++ : outputAudioSpanSeq++ + audioSpanDrafts[input.stream] = { + stream: input.stream, + seq, + producerId: input.producerId, + producerEpoch: config.epoch, + byteStart: input.byteStart, + byteEnd, + sampleStart, + sampleCount, + sampleRate: input.format.sampleRate, + channels: input.format.channels, + timingSource: input.timingSource, + createdAt: new Date().toISOString(), + capturedAt: input.capturedAt, + receivedAt: input.receivedAt, + participantId: input.participantId, + providerItemId: input.providerItemId, + responseId: input.responseId, + } + } + + const discardCommittedAudioChunks = (): void => { + while ( + pendingAudioChunks.length > 0 && + pendingAudioChunks[0]!.end <= committedAudioInBytes + ) { + pendingAudioChunks.shift() + } + } + + const appendAudioRangeToProvider = async ( + start: number, + end: number + ): Promise => { + if (!providerSession.appendInputAudio) return + for (const chunk of pendingAudioChunks) { + if (chunk.end <= start) continue + if (chunk.start >= end) break + const sliceStart = Math.max(0, start - chunk.start) + const sliceEnd = Math.min(chunk.data.byteLength, end - chunk.start) + if (sliceEnd <= sliceStart) continue + await providerSession.appendInputAudio( + chunk.data.subarray(sliceStart, sliceEnd) + ) + } + } + + const processPendingInputCommits = async (): Promise => { + if (processingInputCommits) return + processingInputCommits = true + try { + while (pendingInputCommits.length > 0) { + const command = pendingInputCommits[0]! + const commitAudioBytes = command.afterAudioBytes ?? 
audioInBytes + if (audioInBytes < commitAudioBytes) return + + pendingInputCommits.shift() + if (commitAudioBytes <= committedAudioInBytes) { + runtimeLog.info( + logPrefix, + `realtime input_audio.commit ignored session=${session.id} audioInBytes=${audioInBytes} committedAudioInBytes=${committedAudioInBytes} commitAudioBytes=${commitAudioBytes}` + ) + continue + } + + const pendingAudioBytes = commitAudioBytes - committedAudioInBytes + if (pendingAudioBytes < REALTIME_MIN_INPUT_COMMIT_BYTES) { + runtimeLog.info( + logPrefix, + `realtime input_audio.commit skipped session=${session.id} audioInBytes=${audioInBytes} committedAudioInBytes=${committedAudioInBytes} commitAudioBytes=${commitAudioBytes}` + ) + await providerSession.clearInputAudio?.() + committedAudioInBytes = commitAudioBytes + discardCommittedAudioChunks() + continue + } + + await appendAudioRangeToProvider( + committedAudioInBytes, + commitAudioBytes + ) + await providerSession.commitInputAudio?.() + committedAudioInBytes = commitAudioBytes + discardCommittedAudioChunks() + } + } finally { + processingInputCommits = false + } + } + + runtimeLog.info( + logPrefix, + `realtime stream bridge starting session=${session.id} inputMode=${manualInputCommits ? 
`manual-commit` : `provider-vad`} audioIn=${session.streams.audio_in} audioOut=${session.streams.audio_out}` + ) + + if (providerSession.appendInputAudio) { + tasks.push( + (async () => { + const response = await audioIn.stream({ + live: true, + signal: abort.signal, + warnOnHttp: false, + }) + try { + for await (const chunk of response.bodyStream()) { + if (abort.signal.aborted) break + const nextChunkCount = audioInChunks + 1 + if (nextChunkCount === 1) { + runtimeLog.info( + logPrefix, + `realtime audio/in first chunk session=${session.id} bytes=${chunk.byteLength}` + ) + } + const start = audioInBytes + audioInChunks = nextChunkCount + audioInBytes += chunk.byteLength + appendAudioSpan({ + stream: `input`, + byteStart: start, + byteLength: chunk.byteLength, + format: inputAudioFormat, + producerId: session.streams.audio_in, + timingSource: `runtime`, + participantId: `user`, + receivedAt: new Date().toISOString(), + }) + if (manualInputCommits) { + pendingAudioChunks.push({ + start, + end: start + chunk.byteLength, + data: chunk, + }) + await processPendingInputCommits() + } else { + await providerSession.appendInputAudio?.(chunk) + } + } + } finally { + response.cancel() + } + })().catch((error) => { + if (!abort.signal.aborted) { + runtimeLog.warn( + `[agent-runtime] realtime audio/in pump failed:`, + error + ) + } + }) + ) + } + + tasks.push( + (async () => { + const response = await controlIn.stream({ + live: true, + signal: abort.signal, + json: true, + warnOnHttp: false, + }) + try { + for await (const command of response.jsonStream()) { + if (abort.signal.aborted || !isRealtimeControlInput(command)) { + continue + } + controlInCommands += 1 + if (controlInCommands === 1) { + runtimeLog.info( + logPrefix, + `realtime control/in first command session=${session.id} type=${command.type}` + ) + } + switch (command.type) { + case `input_text`: + await providerSession.sendText?.(command.text) + break + case `input_audio.commit`: + if (manualInputCommits) { + 
pendingInputCommits.push({ + afterAudioBytes: command.afterAudioBytes, + }) + await processPendingInputCommits() + } else { + runtimeLog.info( + logPrefix, + `realtime input_audio.commit ignored in provider-vad mode session=${session.id}` + ) + } + break + case `response.cancel`: + await providerSession.cancelResponse?.() + break + case `output_audio.truncate`: + await providerSession.truncateOutputAudio?.({ + itemId: command.itemId, + audioEndMs: command.audioEndMs, + }) + break + case `session.close`: + await providerSession.close?.(command.reason) + abort.abort() + break + } + } + } finally { + response.cancel() + } + })().catch((error) => { + if (!abort.signal.aborted) { + runtimeLog.warn( + `[agent-runtime] realtime control/in pump failed:`, + error + ) + } + }) + ) + + return { + async writeProviderEvent(event) { + controlOutEvents += 1 + if (controlOutEvents === 1) { + runtimeLog.info( + logPrefix, + `realtime provider first event session=${session.id} type=${event.type}` + ) + } + if (event.type === `output_audio.delta`) { + const byteStart = audioOutBytes + audioOutChunks += 1 + audioOutBytes += event.audio.byteLength + if (audioOutChunks === 1) { + runtimeLog.info( + logPrefix, + `realtime audio/out first chunk session=${session.id} bytes=${event.audio.byteLength}` + ) + } + appendAudioSpan({ + stream: `output`, + byteStart, + byteLength: event.audio.byteLength, + format: outputAudioFormat, + producerId: session.streams.audio_out, + timingSource: `provider`, + participantId: `assistant`, + providerItemId: event.itemId, + responseId: event.responseId, + receivedAt: new Date().toISOString(), + }) + trackOutputAppend( + audioOut.append(event.audio), + `realtime audio/out append failed` + ) + } + trackOutputAppend( + controlOut.append(jsonBytes(realtimeControlOutput(event))), + `realtime control/out append failed` + ) + }, + async close() { + abort.abort() + config.runSignal?.removeEventListener(`abort`, abortFromRun) + await Promise.allSettled([...tasks, 
...pendingOutputAppends]) + flushAudioSpan(`input`) + flushAudioSpan(`output`) + runtimeLog.info( + logPrefix, + `realtime stream bridge closed session=${session.id} audioInChunks=${audioInChunks} audioInBytes=${audioInBytes} controlInCommands=${controlInCommands} providerEvents=${controlOutEvents} audioOutChunks=${audioOutChunks} audioOutBytes=${audioOutBytes}` + ) + }, + } +} + const MAX_HYDRATED_IMAGE_ATTACHMENTS = 4 const MAX_HYDRATED_IMAGE_ATTACHMENT_BYTES = 10 * 1024 * 1024 @@ -97,6 +694,10 @@ export interface HandlerContextConfig { }) => void | Promise ) => void hydratedEventSourceWake?: HydratedEventSourceWake | null + realtimeStreams?: { + baseUrl: string + headers?: Record + } doObserve: ( source: ObservationSource, wake?: Wake @@ -447,6 +1048,8 @@ export function createHandlerContext( ): HandlerContextResult { let sleepRequested = false let agentConfig: AgentConfig | null = null + let realtimeConfig: RealtimeConfig | null = null + let activeRealtimeProviderSession: RealtimeProviderSession | null = null let useContextConfig: UseContextConfig | null = null let useContextHash = `` let useContextRegistrations = 0 @@ -514,6 +1117,85 @@ export function createHandlerContext( }, } + function realtimeSessions(): Array { + const sessions: Array = [] + for (const entry of config.db.collections.manifests.toArray) { + if (isRealtimeSessionManifest(entry)) { + sessions.push(entry) + } + } + return sessions.sort((a, b) => a.startedAt.localeCompare(b.startedAt)) + } + + function activeRealtimeSession(): ManifestRealtimeSessionEntry | undefined { + return realtimeSessions().filter(realtimeManifestIsActive).at(-1) + } + + async function updateRealtimeSessionStatus( + session: ManifestRealtimeSessionEntry | undefined, + status: `active` | `closed` | `failed`, + opts: { reason?: string; error?: string } = {} + ): Promise { + if (!session) return + + const key = session.key ?? 
`realtime-session:${session.id}` + const terminal = status === `closed` || status === `failed` + const endedAt = terminal ? new Date().toISOString() : session.endedAt + const meta = { + ...(session.meta ?? {}), + ...(opts.reason ? { reason: opts.reason } : {}), + ...(opts.error ? { error: opts.error } : {}), + } + + const nextSession: ManifestRealtimeSessionEntry = { + key, + kind: `realtime-session`, + id: session.id, + provider: session.provider, + model: session.model, + ...(session.voice ? { voice: session.voice } : {}), + ...(session.reasoningEffort + ? { reasoningEffort: session.reasoningEffort } + : {}), + ...(typeof session.interruptResponse === `boolean` + ? { interruptResponse: session.interruptResponse } + : {}), + status, + startedAt: session.startedAt, + endedAt: endedAt ?? null, + streams: session.streams, + retention: `forever`, + ...(Object.keys(meta).length > 0 ? { meta } : {}), + } + + config.wakeSession.registerManifestEntry(nextSession) + config.writeEvent( + entityStateSchema.realtimeSessions.update({ + key, + value: { + session_id: session.id, + provider: session.provider, + model: session.model, + ...(session.voice ? { voice: session.voice } : {}), + ...(session.reasoningEffort + ? { reasoning_effort: session.reasoningEffort } + : {}), + ...(typeof session.interruptResponse === `boolean` + ? { interrupt_response: session.interruptResponse } + : {}), + status, + started_at: session.startedAt, + ...(endedAt ? { ended_at: endedAt } : {}), + streams: session.streams, + ...(opts.reason ? { reason: opts.reason } : {}), + ...(opts.error ? { error: opts.error } : {}), + ...(Object.keys(meta).length > 0 ? 
{ meta } : {}), + } as never, + }) as ChangeEvent + ) + await config.wakeSession.commitManifestEntries() + } + function structuralHash(nextConfig: UseContextConfig): string { const sources = Object.entries(nextConfig.sources) .sort(([leftName], [rightName]) => leftName.localeCompare(rightName)) @@ -911,6 +1593,801 @@ export function createHandlerContext( }, } + const realtimeHandle: RealtimeHandle = { + async run(): Promise { + if (!realtimeConfig) { + throw new Error( + `[agent-runtime] realtime.run() called without useRealtime().` + ) + } + + if (config.prepareAgentRun) { + await config.prepareAgentRun() + } + + const activeRealtimeConfig = realtimeConfig + const bridge = createOutboundBridge( + await loadOutboundIdSeed(config.db), + config.writeEvent + ) + const startedAt = Date.now() + let textStarted = false + let currentToolCall: + | { toolCallId: string; name: string; args: unknown } + | undefined + const realtimeSession = activeRealtimeSession() + + const endText = (): void => { + if (!textStarted) return + bridge.onTextEnd() + textStarted = false + } + + const emitText = (delta: string): void => { + if (delta.length === 0) return + if (!textStarted) { + bridge.onTextStart() + textStarted = true + } + bridge.onTextDelta(delta) + } + + const transcriptTextByKey = new Map() + const transcriptCreatedAtByKey = new Map() + const transcriptDeltaSeqByKey = new Map() + const transcriptFallbackIds = new Map<`input` | `output`, string>() + const inputTranscriptKeyByTurnId = new Map() + const outputTranscriptKeyByResponseId = new Map() + const outputTranscriptKeysByResponseId = new Map>() + const outputTranscriptSegmentByResponseId = new Map() + const outputTranscriptSourceByKey = new Map() + let transcriptFallbackCounter = 0 + let pendingInputTranscriptKey: string | undefined + let activeOutputTranscript: + | { key: string; responseId?: string } + | undefined + let providerSessionId = realtimeSession?.id + + const currentTranscriptSessionId = (): string => + 
realtimeSession?.id ?? providerSessionId ?? `ephemeral` + + const transcriptKey = ( + direction: `input` | `output`, + id?: string + ): string => { + let stableId = id + if (!stableId) { + stableId = transcriptFallbackIds.get(direction) + if (!stableId) { + stableId = `fallback-${transcriptFallbackCounter}` + transcriptFallbackCounter += 1 + transcriptFallbackIds.set(direction, stableId) + } + } + return `realtime-transcript:${currentTranscriptSessionId()}:${direction}:${stableId}` + } + + const inputTranscriptKey = (turnId?: string): string => { + if (turnId) { + const existing = inputTranscriptKeyByTurnId.get(turnId) + if (existing) return existing + if (pendingInputTranscriptKey) { + inputTranscriptKeyByTurnId.set(turnId, pendingInputTranscriptKey) + return pendingInputTranscriptKey + } + const key = transcriptKey(`input`, turnId) + inputTranscriptKeyByTurnId.set(turnId, key) + return key + } + const key = pendingInputTranscriptKey ?? transcriptKey(`input`) + pendingInputTranscriptKey = key + return key + } + + const trackOutputTranscriptKey = ( + responseId: string | undefined, + key: string + ): void => { + activeOutputTranscript = { key, responseId } + if (!responseId) return + const keys = outputTranscriptKeysByResponseId.get(responseId) ?? [] + if (!keys.includes(key)) { + keys.push(key) + outputTranscriptKeysByResponseId.set(responseId, keys) + } + } + + const outputTranscriptKey = (responseId?: string): string => { + if (responseId) { + const existing = outputTranscriptKeyByResponseId.get(responseId) + if (existing) return existing + const key = transcriptKey(`output`, responseId) + outputTranscriptKeyByResponseId.set(responseId, key) + trackOutputTranscriptKey(responseId, key) + return key + } + const key = activeOutputTranscript?.responseId + ? transcriptKey(`output`) + : (activeOutputTranscript?.key ?? 
transcriptKey(`output`)) + trackOutputTranscriptKey(undefined, key) + return key + } + + const rotateActiveOutputTranscript = (): void => { + const active = activeOutputTranscript + if (!active) return + const text = transcriptTextByKey.get(active.key) ?? `` + if (text.length === 0) return + + if (active.responseId) { + const nextSegment = + (outputTranscriptSegmentByResponseId.get(active.responseId) ?? 0) + + 1 + outputTranscriptSegmentByResponseId.set( + active.responseId, + nextSegment + ) + const key = transcriptKey( + `output`, + `${active.responseId}:segment-${nextSegment}` + ) + outputTranscriptKeyByResponseId.set(active.responseId, key) + trackOutputTranscriptKey(active.responseId, key) + return + } + + transcriptFallbackIds.delete(`output`) + activeOutputTranscript = undefined + } + + const outputTranscriptSourceRank = (source: string): number => { + switch (source) { + case `response.output_audio_transcript`: + return 3 + case `response.audio_transcript`: + return 2 + case `response.output_text`: + return 1 + default: + return 0 + } + } + + const outputTranscriptSourceKey = (input: { + responseId?: string + itemId?: string + contentIndex?: number + }): string | undefined => { + if (input.responseId) { + return `${input.responseId}:${input.itemId ?? ``}:${input.contentIndex ?? 0}` + } + if (input.itemId) { + return `${input.itemId}:${input.contentIndex ?? 0}` + } + return undefined + } + + const resetOutputTranscriptText = ( + responseId: string | undefined + ): void => { + const keys = responseId + ? (outputTranscriptKeysByResponseId.get(responseId) ?? []) + : activeOutputTranscript + ? 
[activeOutputTranscript.key] + : [] + for (const key of keys) { + transcriptTextByKey.set(key, ``) + deleteRealtimeTranscriptDeltas(key) + } + } + + const shouldUseOutputTranscriptSource = (input: { + responseId?: string + itemId?: string + contentIndex?: number + transcriptSource?: string + }): boolean => { + if (!input.transcriptSource) return true + const key = outputTranscriptSourceKey(input) + if (!key) return true + const existing = outputTranscriptSourceByKey.get(key) + if (!existing) { + outputTranscriptSourceByKey.set(key, input.transcriptSource) + return true + } + if (existing === input.transcriptSource) return true + if ( + outputTranscriptSourceRank(input.transcriptSource) > + outputTranscriptSourceRank(existing) + ) { + outputTranscriptSourceByKey.set(key, input.transcriptSource) + resetOutputTranscriptText(input.responseId) + return true + } + return false + } + + const writeRealtimeTranscript = (input: { + direction: `input` | `output` + key: string + text: string + status: `partial` | `final` + turnId?: string + responseId?: string + allowEmpty?: boolean + }): void => { + const collection = config.db.collections.realtimeTranscripts + if ( + input.text.length === 0 && + !input.allowEmpty && + !collection.has(input.key) + ) { + return + } + + const existing = collection.get(input.key) as + | { created_at?: string } + | undefined + const createdAt = + transcriptCreatedAtByKey.get(input.key) ?? + existing?.created_at ?? + new Date().toISOString() + transcriptCreatedAtByKey.set(input.key, createdAt) + + const value = { + session_id: currentTranscriptSessionId(), + direction: input.direction, + text: input.text, + status: input.status, + audio_stream: input.direction, + ...(input.turnId ? { turn_id: input.turnId } : {}), + ...(input.responseId ? { response_id: input.responseId } : {}), + created_at: createdAt, + } + config.writeEvent( + (collection.has(input.key) + ? 
entityStateSchema.realtimeTranscripts.update({ + key: input.key, + value: value as never, + }) + : entityStateSchema.realtimeTranscripts.insert({ + key: input.key, + value: value as never, + })) as ChangeEvent + ) + + emitRealtimeTranscript(input) + } + + const emitRealtimeTranscript = (input: { + direction: `input` | `output` + key: string + text: string + status: `partial` | `final` + turnId?: string + responseId?: string + }): void => { + const onTranscript = activeRealtimeConfig.onTranscript + if (!onTranscript) return + void Promise.resolve( + onTranscript({ + key: input.key, + sessionId: currentTranscriptSessionId(), + direction: input.direction, + text: input.text, + status: input.status, + ...(input.turnId ? { turnId: input.turnId } : {}), + ...(input.responseId ? { responseId: input.responseId } : {}), + }) + ).catch((error) => { + runtimeLog.warn( + `[agent-runtime]`, + `realtime transcript callback failed:`, + error + ) + }) + } + + const writeRealtimeTranscriptDelta = (input: { + key: string + delta: string + }): void => { + if (input.delta.length === 0) return + const seq = transcriptDeltaSeqByKey.get(input.key) ?? 0 + transcriptDeltaSeqByKey.set(input.key, seq + 1) + config.writeEvent( + entityStateSchema.textDeltas.insert({ + key: `${input.key}:delta-${seq}`, + value: { + text_id: input.key, + realtime_transcript_id: input.key, + delta: input.delta, + } as never, + }) as ChangeEvent + ) + } + + const deleteRealtimeTranscriptDeltas = (key: string): void => { + const deltaCount = transcriptDeltaSeqByKey.get(key) ?? 0 + for (let index = 0; index < deltaCount; index += 1) { + config.writeEvent( + entityStateSchema.textDeltas.delete({ + key: `${key}:delta-${index}`, + }) as ChangeEvent + ) + } + transcriptDeltaSeqByKey.set(key, 0) + } + + const reconcileRealtimeTranscriptDeltas = ( + key: string, + finalText: string + ): void => { + const currentText = transcriptTextByKey.get(key) ?? 
`` + if (finalText === currentText) return + if (finalText.startsWith(currentText)) { + writeRealtimeTranscriptDelta({ + key, + delta: finalText.slice(currentText.length), + }) + return + } + deleteRealtimeTranscriptDeltas(key) + writeRealtimeTranscriptDelta({ key, delta: finalText }) + } + + const beginRealtimeTranscript = (input: { + direction: `input` | `output` + turnId?: string + responseId?: string + }): void => { + const key = + input.direction === `input` + ? inputTranscriptKey(input.turnId) + : outputTranscriptKey(input.responseId) + const existing = config.db.collections.realtimeTranscripts.get(key) as + | { text?: string } + | undefined + const text = transcriptTextByKey.get(key) ?? existing?.text ?? `` + transcriptTextByKey.set(key, text) + writeRealtimeTranscript({ + direction: input.direction, + key, + text, + status: `partial`, + turnId: input.turnId, + responseId: input.responseId, + allowEmpty: true, + }) + } + + const appendRealtimeTranscript = (input: { + direction: `input` | `output` + delta: string + turnId?: string + responseId?: string + itemId?: string + contentIndex?: number + transcriptSource?: string + }): void => { + if (input.delta.length === 0) return + if ( + input.direction === `output` && + !shouldUseOutputTranscriptSource(input) + ) { + return + } + const key = + input.direction === `input` + ? inputTranscriptKey(input.turnId) + : outputTranscriptKey(input.responseId) + const text = `${transcriptTextByKey.get(key) ?? 
``}${input.delta}` + transcriptTextByKey.set(key, text) + if (!config.db.collections.realtimeTranscripts.has(key)) { + writeRealtimeTranscript({ + direction: input.direction, + key, + text: ``, + status: `partial`, + turnId: input.turnId, + responseId: input.responseId, + allowEmpty: true, + }) + } + writeRealtimeTranscriptDelta({ key, delta: input.delta }) + emitRealtimeTranscript({ + direction: input.direction, + key, + text, + status: `partial`, + turnId: input.turnId, + responseId: input.responseId, + }) + } + + const completeRealtimeTranscript = (input: { + direction: `input` | `output` + text?: string + turnId?: string + responseId?: string + }): void => { + const key = + input.direction === `input` + ? inputTranscriptKey(input.turnId) + : outputTranscriptKey(input.responseId) + const text = input.text ?? transcriptTextByKey.get(key) ?? `` + reconcileRealtimeTranscriptDeltas(key, text) + transcriptTextByKey.set(key, text) + writeRealtimeTranscript({ + direction: input.direction, + key, + text, + status: `final`, + turnId: input.turnId, + responseId: input.responseId, + }) + if ( + (input.direction === `input` && !input.turnId) || + (input.direction === `output` && !input.responseId) + ) { + transcriptFallbackIds.delete(input.direction) + } + if (input.direction === `input` && pendingInputTranscriptKey === key) { + pendingInputTranscriptKey = undefined + if (input.turnId) { + transcriptFallbackIds.delete(`input`) + } + } + } + + const completeOutputTranscript = (input: { + text?: string + responseId?: string + itemId?: string + contentIndex?: number + transcriptSource?: string + }): void => { + if (!shouldUseOutputTranscriptSource(input)) return + const existingKeys = input.responseId + ? outputTranscriptKeysByResponseId.get(input.responseId) + : activeOutputTranscript + ? [activeOutputTranscript.key] + : undefined + const keys = + existingKeys && existingKeys.length > 0 + ? 
existingKeys + : [outputTranscriptKey(input.responseId)] + + for (const [index, key] of keys.entries()) { + const existing = config.db.collections.realtimeTranscripts.get( + key + ) as { text?: string } | undefined + const text = + keys.length === 1 && input.text !== undefined + ? input.text + : (transcriptTextByKey.get(key) ?? + existing?.text ?? + (index === keys.length - 1 ? (input.text ?? ``) : ``)) + reconcileRealtimeTranscriptDeltas(key, text) + transcriptTextByKey.set(key, text) + writeRealtimeTranscript({ + direction: `output`, + key, + text, + status: `final`, + responseId: input.responseId, + }) + } + + if (!input.responseId) { + transcriptFallbackIds.delete(`output`) + } + if ( + activeOutputTranscript && + activeOutputTranscript.responseId === input.responseId + ) { + activeOutputTranscript = undefined + } + } + + const composedTools = (await composeToolsWithProviders( + activeRealtimeConfig.tools ?? [] + )) as Array + const providerTools = applyRealtimeToolPolicy( + composedTools, + activeRealtimeConfig.toolPolicy + ) + const activeRealtimeSessionId = realtimeSession?.id + let realtimeCloseReason: string | undefined + const messages = + activeRealtimeConfig.context?.includeTimeline === false + ? [] + : await hydrateAttachmentBlocks( + runtimeTimelineMessages(config.db, { + projection: (item) => { + if ( + item.kind === `realtime_transcript` && + item.sessionId === activeRealtimeSessionId + ) { + return null + } + return defaultProjection(item) + }, + }).map(({ at: _at, ...message }) => message as LLMMessage) + ) + let realtimeIo: RealtimeStreamIo | undefined + let realtimeSessionTerminalWritten = false + let realtimeSessionLimitTimer: ReturnType | undefined + + async function handleProviderEvent( + event: RealtimeProviderEvent + ): Promise { + switch (event.type) { + case `session.started`: + providerSessionId = + realtimeSession?.id ?? event.sessionId ?? 
providerSessionId + break + + case `session.updated`: + case `output_audio.delta`: + case `output_audio.completed`: + case `response.started`: + case `response.cancelled`: + break + + case `input_audio.speech_started`: + rotateActiveOutputTranscript() + beginRealtimeTranscript({ + direction: `input`, + turnId: event.turnId, + }) + break + + case `input_audio.speech_stopped`: + if (event.turnId || pendingInputTranscriptKey) { + beginRealtimeTranscript({ + direction: `input`, + turnId: event.turnId, + }) + } + break + + case `input_audio.committed`: + beginRealtimeTranscript({ + direction: `input`, + turnId: event.turnId, + }) + break + + case `input_transcript.delta`: + appendRealtimeTranscript({ + direction: `input`, + delta: event.delta, + turnId: event.turnId, + }) + break + + case `input_transcript.completed`: + completeRealtimeTranscript({ + direction: `input`, + text: event.text, + turnId: event.turnId, + }) + break + + case `session.closed`: + realtimeCloseReason = event.reason + endText() + break + + case `response.completed`: + endText() + break + + case `session.error`: + if (event.code === `response_cancel_not_active`) { + runtimeLog.warn( + `[agent-runtime]`, + `realtime provider ignored inactive response cancellation: ${event.error}` + ) + break + } + if ( + event.code === `invalid_value` && + event.error.includes(`Audio content`) && + event.error.includes(`already shorter than`) + ) { + runtimeLog.warn( + `[agent-runtime]`, + `realtime provider ignored stale output audio truncate: ${event.error}` + ) + break + } + throw new Error( + `[agent-runtime] realtime provider error${event.code ? 
` ${event.code}` : ``}: ${event.error}` + ) + + case `output_transcript.delta`: + appendRealtimeTranscript({ + direction: `output`, + delta: event.delta, + responseId: event.responseId, + itemId: event.itemId, + contentIndex: event.contentIndex, + transcriptSource: event.transcriptSource, + }) + break + + case `output_transcript.completed`: + completeOutputTranscript({ + text: event.text, + responseId: event.responseId, + itemId: event.itemId, + contentIndex: event.contentIndex, + transcriptSource: event.transcriptSource, + }) + break + + case `tool_call.started`: + currentToolCall = { + toolCallId: event.toolCallId, + name: event.name, + args: event.args, + } + if (event.args !== undefined) { + bridge.onToolCallStart(event.toolCallId, event.name, event.args) + } + break + + case `tool_call.arguments_delta`: + break + + case `tool_call.arguments_completed`: + currentToolCall = { + toolCallId: event.toolCallId, + name: event.name, + args: event.args, + } + bridge.onToolCallStart(event.toolCallId, event.name, event.args) + break + + case `tool_call.completed`: { + if (currentToolCall?.toolCallId !== event.toolCallId) { + bridge.onToolCallStart(event.toolCallId, event.name, undefined) + } + bridge.onToolCallEnd( + event.toolCallId, + event.name, + event.result, + event.isError ?? false + ) + break + } + } + } + + try { + bridge.onRunStart() + bridge.onStepStart({ + modelProvider: activeRealtimeConfig.provider.id, + modelId: activeRealtimeConfig.provider.model, + }) + + if (activeRealtimeConfig.testResponses) { + const messageText = getTriggerMessageText( + config.db, + config.wakeEvent, + config.events, + config.wakeOffset, + config.hydratedEventSourceWake + ) + const responses = activeRealtimeConfig.testResponses + if (Array.isArray(responses)) { + const priorRunCount = ( + await queryOnce((q) => + q.from({ runs: config.db.collections.runs }) + ) + ).length + emitText( + responses[priorRunCount % Math.max(responses.length, 1)] ?? 
`` + ) + } else { + const response = await responses(messageText, bridge) + if (response !== undefined) emitText(response) + } + endText() + } else { + activeRealtimeProviderSession = + await activeRealtimeConfig.provider.connect({ + systemPrompt: activeRealtimeConfig.systemPrompt, + messages, + tools: providerTools, + audio: activeRealtimeConfig.audio, + session: realtimeSession, + signal: config.runSignal, + }) + realtimeSessionLimitTimer = setTimeout(() => { + runtimeLog.info( + `[agent-runtime]`, + `realtime session soft limit reached session=${realtimeSession?.id ?? `ephemeral`}` + ) + void activeRealtimeProviderSession?.close?.( + `session-duration-limit` + ) + }, REALTIME_SESSION_SOFT_LIMIT_MS) + await updateRealtimeSessionStatus(realtimeSession, `active`) + realtimeIo = createRealtimeStreamIo( + config, + realtimeSession, + activeRealtimeProviderSession, + activeRealtimeConfig.audio + ) + + for await (const event of activeRealtimeProviderSession.events) { + if (config.runSignal?.aborted) { + break + } + await realtimeIo?.writeProviderEvent(event) + await handleProviderEvent(event) + } + } + + endText() + await updateRealtimeSessionStatus(realtimeSession, `closed`, { + reason: config.runSignal?.aborted + ? `aborted` + : (realtimeCloseReason ?? `completed`), + }) + realtimeSessionTerminalWritten = true + bridge.onStepEnd({ + finishReason: config.runSignal?.aborted ? `aborted` : `stop`, + durationMs: Date.now() - startedAt, + }) + bridge.onRunEnd({ + finishReason: config.runSignal?.aborted ? `aborted` : `stop`, + }) + } catch (error) { + endText() + if (!realtimeSessionTerminalWritten) { + await updateRealtimeSessionStatus(realtimeSession, `failed`, { + error: error instanceof Error ? 
error.message : String(error), + }) + realtimeSessionTerminalWritten = true + } + bridge.onStepEnd({ + finishReason: `error`, + durationMs: Date.now() - startedAt, + }) + bridge.onRunEnd({ finishReason: `error` }) + throw error + } finally { + if (realtimeSessionLimitTimer) { + clearTimeout(realtimeSessionLimitTimer) + } + await realtimeIo?.close() + activeRealtimeProviderSession = null + } + + return { + writes: [], + toolCalls: [], + usage: { tokens: 0, duration: Date.now() - startedAt }, + } + }, + async close(reason?: string): Promise { + await activeRealtimeProviderSession?.close?.(reason) + }, + async stop(reason?: string): Promise { + await this.close(reason) + }, + async cancelResponse(): Promise { + await activeRealtimeProviderSession?.cancelResponse?.() + }, + async sendText(text: string): Promise { + await activeRealtimeProviderSession?.sendText?.(text) + }, + } + const ctx: DebugHandlerContext = { firstWake: config.firstWake, wake: toHandlerWake(config.wakeEvent), @@ -931,6 +2408,10 @@ export function createHandlerContext( agentConfig = cfg return agent }, + useRealtime(cfg) { + realtimeConfig = cfg + return realtimeHandle + }, useContext(nextConfig) { assertValidUseContextConfig(nextConfig) const hash = structuralHash(nextConfig) @@ -951,6 +2432,10 @@ export function createHandlerContext( useContextRegistrations: () => useContextRegistrations, }, agent, + realtime: { + activeSession: activeRealtimeSession, + sessions: realtimeSessions, + }, observe: ((source: ObservationSource, opts?: { wake?: Wake }) => { return config.doObserve(source, opts?.wake) as Promise< ObservationHandle & EntityHandle & SharedStateHandle diff --git a/packages/agents-runtime/src/entity-schema.ts b/packages/agents-runtime/src/entity-schema.ts index 7d70d3cef2..3374ded0b4 100644 --- a/packages/agents-runtime/src/entity-schema.ts +++ b/packages/agents-runtime/src/entity-schema.ts @@ -11,6 +11,7 @@ import type { } from '@standard-schema/spec' import type { SlashCommandRow } from 
'./composer-input' import type { JsonValue } from './types' +import type { OpenAIRealtimeReasoningEffort } from './realtime-options' // ============================================================================ // Passthrough Schema Utility @@ -163,7 +164,8 @@ type TextValue = { type TextDeltaValue = { key?: string text_id: string - run_id: string + run_id?: string + realtime_transcript_id?: string delta: string } type ToolCallValue = { @@ -347,6 +349,91 @@ type ManifestFutureSendScheduleEntryValue = { failedAt?: string lastError?: string } +type RealtimeSessionStatusValue = + | `requested` + | `active` + | `closing` + | `closed` + | `failed` +type RealtimeSessionStreamRefsValue = { + audio_in: string + audio_out: string + control_in: string + control_out: string +} +type ManifestRealtimeSessionEntryValue = { + key?: string + kind: `realtime-session` + id: string + provider: string + model: string + voice?: string + reasoningEffort?: OpenAIRealtimeReasoningEffort + interruptResponse?: boolean + status: RealtimeSessionStatusValue + startedAt: string + endedAt?: string | null + streams: RealtimeSessionStreamRefsValue + retention: `forever` + meta?: Record +} +type RealtimeSessionValue = { + key?: string + session_id: string + provider: string + model: string + voice?: string + reasoning_effort?: OpenAIRealtimeReasoningEffort + interrupt_response?: boolean + status: RealtimeSessionStatusValue + started_at: string + ended_at?: string + streams: RealtimeSessionStreamRefsValue + reason?: string + error?: string + meta?: Record +} +type RealtimeAudioSpanValue = { + key?: string + session_id: string + stream: `input` | `output` + producer_id: string + producer_epoch: number + seq: number + offset: string + next_offset?: string + byte_start?: number + byte_end?: number + byte_length: number + sample_start: number + sample_count: number + sample_rate: number + channels: number + codec: `pcm16` + timing_source: `client` | `runtime` | `provider` + captured_at?: string + 
received_at?: string + participant_id?: string + turn_id?: string + provider_item_id?: string + response_id?: string + created_at: string +} +type RealtimeTranscriptValue = { + key?: string + session_id: string + direction: `input` | `output` + text: string + status: `partial` | `final` + turn_id?: string + response_id?: string + audio_stream?: `input` | `output` + audio_offset?: string + audio_next_offset?: string + sample_start?: number + sample_end?: number + created_at: string +} type ReplayWatermarkValue = { key?: string source_id: string @@ -488,7 +575,8 @@ function createTextDeltaSchema(): Schema { key: z.string().optional(), ...timelineOrderField, text_id: z.string(), - run_id: z.string(), + run_id: z.string().optional(), + realtime_transcript_id: z.string().optional(), delta: z.string(), }) } @@ -707,6 +795,20 @@ function createContextRemovedSchema(): Schema { timestamp: z.string(), }) } + +function createRealtimeSessionStreamRefsSchema(): Schema { + return z.object({ + audio_in: z.string(), + audio_out: z.string(), + control_in: z.string(), + control_out: z.string(), + }) +} + +function createRealtimeSessionStatusSchema() { + return z.enum([`requested`, `active`, `closing`, `closed`, `failed`]) +} + function createManifestSchema(): Schema< | ManifestChildEntryValue | ManifestSourceEntryValue @@ -716,6 +818,7 @@ function createManifestSchema(): Schema< | ManifestContextEntryValue | ManifestCronScheduleEntryValue | ManifestFutureSendScheduleEntryValue + | ManifestRealtimeSessionEntryValue > { return z.union([ z.object({ @@ -818,6 +921,23 @@ function createManifestSchema(): Schema< failedAt: z.string().optional(), lastError: z.string().optional(), }), + z.object({ + key: z.string().optional(), + ...timelineOrderField, + kind: z.literal(`realtime-session`), + id: z.string(), + provider: z.string(), + model: z.string(), + voice: z.string().optional(), + reasoningEffort: z.enum([`low`, `medium`, `high`]).optional(), + interruptResponse: z.boolean().optional(), 
+ status: createRealtimeSessionStatusSchema(), + startedAt: z.string(), + endedAt: z.string().nullable().optional(), + streams: createRealtimeSessionStreamRefsSchema(), + retention: z.literal(`forever`).default(`forever`), + meta: createJsonObjectSchema().optional(), + }), ]) as unknown as Schema< | ManifestChildEntryValue | ManifestSourceEntryValue @@ -827,9 +947,79 @@ function createManifestSchema(): Schema< | ManifestContextEntryValue | ManifestCronScheduleEntryValue | ManifestFutureSendScheduleEntryValue + | ManifestRealtimeSessionEntryValue > } +function createRealtimeSessionSchema(): Schema { + return z.object({ + key: z.string().optional(), + ...timelineOrderField, + session_id: z.string(), + provider: z.string(), + model: z.string(), + voice: z.string().optional(), + reasoning_effort: z.enum([`low`, `medium`, `high`]).optional(), + interrupt_response: z.boolean().optional(), + status: createRealtimeSessionStatusSchema(), + started_at: z.string(), + ended_at: z.string().optional(), + streams: createRealtimeSessionStreamRefsSchema(), + reason: z.string().optional(), + error: z.string().optional(), + meta: createJsonObjectSchema().optional(), + }) +} + +function createRealtimeAudioSpanSchema(): Schema { + return z.object({ + key: z.string().optional(), + ...timelineOrderField, + session_id: z.string(), + stream: z.enum([`input`, `output`]), + producer_id: z.string(), + producer_epoch: z.number().int().nonnegative(), + seq: z.number().int().nonnegative(), + offset: z.string(), + next_offset: z.string().optional(), + byte_start: z.number().int().nonnegative().optional(), + byte_end: z.number().int().nonnegative().optional(), + byte_length: z.number().int().nonnegative(), + sample_start: z.number().int().nonnegative(), + sample_count: z.number().int().nonnegative(), + sample_rate: z.number().int().positive(), + channels: z.number().int().positive(), + codec: z.literal(`pcm16`), + timing_source: z.enum([`client`, `runtime`, `provider`]), + captured_at: 
z.string().optional(), + received_at: z.string().optional(), + participant_id: z.string().optional(), + turn_id: z.string().optional(), + provider_item_id: z.string().optional(), + response_id: z.string().optional(), + created_at: z.string(), + }) +} + +function createRealtimeTranscriptSchema(): Schema { + return z.object({ + key: z.string().optional(), + ...timelineOrderField, + session_id: z.string(), + direction: z.enum([`input`, `output`]), + text: z.string(), + status: z.enum([`partial`, `final`]), + turn_id: z.string().optional(), + response_id: z.string().optional(), + audio_stream: z.enum([`input`, `output`]).optional(), + audio_offset: z.string().optional(), + audio_next_offset: z.string().optional(), + sample_start: z.number().int().nonnegative().optional(), + sample_end: z.number().int().nonnegative().optional(), + created_at: z.string(), + }) +} + function createReplayWatermarkSchema(): Schema { return z.object({ key: z.string().optional(), @@ -881,6 +1071,10 @@ export type ManifestCronScheduleEntry = SequencedPersistedRow export type ManifestFutureSendScheduleEntry = SequencedPersistedRow +export type RealtimeSessionStatus = RealtimeSessionStatusValue +export type RealtimeSessionStreamRefs = RealtimeSessionStreamRefsValue +export type ManifestRealtimeSessionEntry = + SequencedPersistedRow type ManifestUnion = | ManifestChildEntry | ManifestSourceEntry @@ -890,6 +1084,7 @@ type ManifestUnion = | ManifestContextEntry | ManifestCronScheduleEntry | ManifestFutureSendScheduleEntry + | ManifestRealtimeSessionEntry export type Manifest = ManifestUnion & { id?: string entity_url?: string @@ -921,11 +1116,23 @@ export type Manifest = ManifestUnion & { targetUrl?: string producerId?: string messageType?: string - status?: FutureSendScheduleStatus | AttachmentStatusValue + status?: + | FutureSendScheduleStatus + | AttachmentStatusValue + | RealtimeSessionStatusValue sentAt?: string failedAt?: string lastError?: string -} + provider?: string + model?: string + 
startedAt?: string + endedAt?: string | null + streams?: RealtimeSessionStreamRefs + retention?: `forever` +} +export type RealtimeSession = SequencedPersistedRow +export type RealtimeAudioSpan = SequencedPersistedRow +export type RealtimeTranscript = SequencedPersistedRow export type ReplayWatermark = SequencedPersistedRow // ============================================================================ @@ -949,6 +1156,9 @@ export const ENTITY_COLLECTIONS = { tags: `tags`, slashCommands: `slashCommands`, manifests: `manifests`, + realtimeSessions: `realtimeSessions`, + realtimeAudioSpans: `realtimeAudioSpans`, + realtimeTranscripts: `realtimeTranscripts`, contextInserted: `contextInserted`, contextRemoved: `contextRemoved`, replayWatermarks: `replayWatermarks`, @@ -982,6 +1192,12 @@ export const BUILT_IN_EVENT_SCHEMAS = { context_removed: createContextRemovedSchema() as unknown as BuiltInEntitySchema, manifest: createManifestSchema() as unknown as BuiltInEntitySchema, + realtime_session: + createRealtimeSessionSchema() as unknown as BuiltInEntitySchema, + realtime_audio_span: + createRealtimeAudioSpanSchema() as unknown as BuiltInEntitySchema, + realtime_transcript: + createRealtimeTranscriptSchema() as unknown as BuiltInEntitySchema, replay_watermark: createReplayWatermarkSchema() as unknown as BuiltInEntitySchema, } as const @@ -1008,6 +1224,9 @@ type EntityCollectionsDefinition = { tags: CollectionDefinition slashCommands: CollectionDefinition manifests: CollectionDefinition + realtimeSessions: CollectionDefinition + realtimeAudioSpans: CollectionDefinition + realtimeTranscripts: CollectionDefinition contextInserted: CollectionDefinition contextRemoved: CollectionDefinition replayWatermarks: CollectionDefinition @@ -1104,6 +1323,24 @@ export const builtInCollections: EntityCollectionsDefinition = { type: `manifest`, primaryKey: `key`, }, + realtimeSessions: { + schema: + BUILT_IN_EVENT_SCHEMAS.realtime_session as StandardSchemaV1, + type: `realtime_session`, + 
/**
 * Register a `BasicIndex` on a collection when the collection supports it.
 *
 * The collection is accepted as `unknown` and probed at runtime, so callers
 * can pass collections that do not expose `createIndex`; those are skipped
 * silently. A `null`/`undefined` collection is skipped as well instead of
 * throwing on the property lookup.
 *
 * @param collection - Candidate collection; only used if it has a callable
 *   `createIndex` member.
 * @param indexCallback - Maps a row to the value that should be indexed.
 */
function createCollectionIndex(
  collection: unknown,
  indexCallback: (row: Record<string, unknown>) => unknown
): void {
  // Nothing to index on: reading `.createIndex` off null/undefined would throw.
  if (collection === null || collection === undefined) return

  const { createIndex } = collection as {
    createIndex?: (
      indexCallback: (row: Record<string, unknown>) => unknown,
      config: { indexType: typeof BasicIndex }
    ) => unknown
  }
  if (typeof createIndex !== `function`) return

  // Invoke with the collection as receiver: `createIndex` was detached above.
  createIndex.call(collection, indexCallback, { indexType: BasicIndex })
}
(row) => row.realtime_transcript_id + ) + createCollectionIndex(collections.toolCalls, (row) => row.run_id) + createCollectionIndex(collections.steps, (row) => row.run_id) + createCollectionIndex(collections.errors, (row) => row.run_id) +} + /** * Create a StreamDB connected to a Electric Agents entity stream. * @@ -492,6 +523,7 @@ export function createEntityStreamDB( } replayDb.__electricReplayBatchOffset = replayBatchOffset replayDb.__electricReplaySourceId = streamUrl + createEntityTimelineIndexes(replayDb.collections) const pendingWritePersistences = new Set>() let nextWriteSequence = 0 const pendingWriteSequences = new Set() diff --git a/packages/agents-runtime/src/entity-timeline.ts b/packages/agents-runtime/src/entity-timeline.ts index 0520982298..a2886c39d3 100644 --- a/packages/agents-runtime/src/entity-timeline.ts +++ b/packages/agents-runtime/src/entity-timeline.ts @@ -8,6 +8,7 @@ import { isNull, like, localOnlyCollectionOptions, + min, or, toArray, } from '@durable-streams/state/db' @@ -19,7 +20,12 @@ import type { } from '@tanstack/db' import type { EntityStreamDB } from './entity-stream-db' import { formatPointerOrderToken, type EventPointer } from './event-pointer' -import type { ChildStatusEntry, MessageReceived, Signal } from './entity-schema' +import type { + ChildStatusEntry, + MessageReceived, + RealtimeTranscript, + Signal, +} from './entity-schema' import type { ManifestEntry, Wake, WakeMessage } from './types' export type EntityTimelineState = @@ -133,6 +139,13 @@ export type IncludesSignal = Omit & { order: TimelineOrder } +export type IncludesRealtimeTranscript = Omit< + RealtimeTranscript, + `_seq` | `_timeline_order` +> & { + order: TimelineOrder +} + export interface IncludesContextInserted { key: string order: TimelineOrder @@ -169,6 +182,7 @@ export interface EntityTimelineData { inbox: Array wakes: Array signals: Array + realtimeTranscripts?: Array contextInserted: Array contextRemoved: Array entities: Array @@ -250,6 +264,7 @@ 
export interface EntityTimelineRunRow { export type EntityTimelineInboxRow = IncludesInboxMessage export type EntityTimelineWakeRow = IncludesWakeMessage export type EntityTimelineSignalRow = IncludesSignal +export type EntityTimelineRealtimeTranscriptRow = IncludesRealtimeTranscript export type EntityTimelineQueryRow = | { @@ -258,6 +273,7 @@ export type EntityTimelineQueryRow = run?: undefined wake?: undefined signal?: undefined + realtimeTranscript?: undefined manifest?: undefined } | { @@ -266,6 +282,7 @@ export type EntityTimelineQueryRow = run: EntityTimelineRunRow wake?: undefined signal?: undefined + realtimeTranscript?: undefined manifest?: undefined } | { @@ -274,6 +291,7 @@ export type EntityTimelineQueryRow = run?: undefined wake: EntityTimelineWakeRow signal?: undefined + realtimeTranscript?: undefined manifest?: undefined } | { @@ -282,6 +300,16 @@ export type EntityTimelineQueryRow = run?: undefined wake?: undefined signal: EntityTimelineSignalRow + realtimeTranscript?: undefined + manifest?: undefined + } + | { + $key: string + inbox?: undefined + run?: undefined + wake?: undefined + signal?: undefined + realtimeTranscript: EntityTimelineRealtimeTranscriptRow manifest?: undefined } | { @@ -290,6 +318,7 @@ export type EntityTimelineQueryRow = run?: undefined wake?: undefined signal?: undefined + realtimeTranscript?: undefined manifest: ManifestEntry } @@ -405,6 +434,9 @@ export function normalizeEntityTimelineData( inbox: data.inbox, wakes: data.wakes, signals: data.signals ?? [], + realtimeTranscripts: [...(data.realtimeTranscripts ?? 
[])].sort( + compareTimelineOrder + ), contextInserted: data.contextInserted, contextRemoved: data.contextRemoved, entities: normalizeTimelineEntities(data.entities), @@ -441,6 +473,9 @@ type WakeRow = OrderedValue< type SignalRow = OrderedValue< EntityStreamDB[`collections`][`signals`][`toArray`][number] > +type RealtimeTranscriptValueRow = + EntityStreamDB[`collections`][`realtimeTranscripts`][`toArray`][number] +type RealtimeTranscriptRow = OrderedValue type ContextInsertedValueRow = EntityStreamDB[`collections`][`contextInserted`][`toArray`][number] type ContextRemovedValueRow = @@ -593,6 +628,23 @@ function getOrderableCollection( return collection } +function getOptionalOrderableCollection( + collection: + | { + id?: string + toArray: Array + __electricRowOffsets?: Map + } + | undefined, + id: string +): { + id?: string + toArray: Array + __electricRowOffsets?: Map +} { + return collection ?? { id, toArray: [] } +} + function createOrderIndex( groups: ReadonlyArray> ): Map { @@ -710,6 +762,25 @@ function buildTextContentById( return deltasById } +function buildRealtimeTranscriptContentById( + textDeltas: Array +): Map { + const deltasById = new Map() + + for (const delta of [...textDeltas].sort(compareTimelineOrder)) { + const transcriptId = + (delta as { realtime_transcript_id?: string }).realtime_transcript_id ?? + (!delta.run_id ? delta.text_id : undefined) + if (!transcriptId) continue + deltasById.set( + transcriptId, + `${deltasById.get(transcriptId) ?? 
``}${delta.delta}` + ) + } + + return deltasById +} + function buildIncludesRuns(input: { runs: Array texts: Array @@ -853,6 +924,25 @@ function buildSignalMessages(signals: Array): Array { }) } +function buildRealtimeTranscriptMessages( + transcripts: Array, + textDeltas: Array = [] +): Array { + const textContentById = buildRealtimeTranscriptContentById(textDeltas) + return [...transcripts].sort(compareTimelineOrder).map((transcript) => { + const { + _seq: _ignoredSeq, + _timeline_order: _ignoredTimelineOrder, + ...value + } = transcript + return { + ...value, + order: transcript.order, + text: textContentById.get(transcript.key) ?? transcript.text, + } + }) +} + function buildContextInsertedMessages( entries: Array ): Array { @@ -971,6 +1061,14 @@ export function buildEntityTimelineData( const inbox = withOrderToken(db.collections.inbox) const wakes = withOrderToken(db.collections.wakes) const signals = withOrderToken(db.collections.signals) + const realtimeTranscripts = withOrderToken( + getOptionalOrderableCollection( + db.collections.realtimeTranscripts as + | typeof db.collections.realtimeTranscripts + | undefined, + `realtimeTranscripts` + ) + ) const contextInserted = withOrderToken( getOrderableCollection( db.collections.contextInserted as @@ -1018,6 +1116,7 @@ export function buildEntityTimelineData( inbox, wakes, signals, + realtimeTranscripts, contextInserted, contextRemoved, manifests.filter(hasOrderToken), @@ -1035,6 +1134,10 @@ export function buildEntityTimelineData( inbox: buildInboxMessages(withOrderFromOrderIndex(inbox, orderIndex)), wakes: buildWakeMessages(withOrderFromOrderIndex(wakes, orderIndex)), signals: buildSignalMessages(withOrderFromOrderIndex(signals, orderIndex)), + realtimeTranscripts: buildRealtimeTranscriptMessages( + withOrderFromOrderIndex(realtimeTranscripts, orderIndex), + withOrderFromOrderIndex(textDeltas, orderIndex) + ), contextInserted: buildContextInsertedMessages( 
// Live-query collection over `db.collections.realtimeTranscripts`, one row per
// transcript, tagged with `TIMELINE_KEY` so it can be matched against the
// timeline row by `timelineKey`.
// Wrapped in `cachedCollectionFactory` — presumably memoized per `db`; confirm
// against that helper.
const getEntityRealtimeTranscriptsCollection = cachedCollectionFactory(
  (db: EntityStreamDB) =>
    createLiveQueryCollection({
      id: `${String(db.collections.realtimeTranscripts.id)}:realtime-transcripts-live`,
      query: (q) =>
        q
          .from({ realtimeTranscript: db.collections.realtimeTranscripts })
          .select(({ realtimeTranscript }) => ({
            timelineKey: TIMELINE_KEY,
            key: realtimeTranscript.key,
            // Rows without a `_seq` get order -1.
            order: coalesce(realtimeTranscript._seq, -1),
            session_id: realtimeTranscript.session_id,
            direction: realtimeTranscript.direction,
            // Text is assembled from the text deltas whose
            // `realtime_transcript_id` equals this transcript's key, ordered
            // by `_seq`.
            // NOTE(review): the stored `realtimeTranscript.text` is not
            // projected here, while `buildRealtimeTranscriptMessages` in this
            // file falls back to `transcript.text` when no deltas match —
            // confirm the difference is intended.
            text: concat(
              toArray(
                q
                  .from({ delta: db.collections.textDeltas })
                  .where(({ delta }) =>
                    eq(delta.realtime_transcript_id, realtimeTranscript.key)
                  )
                  .orderBy(({ delta }) => coalesce(delta._seq, -1))
                  .select(({ delta }) => delta.delta)
              )
            ),
            status: realtimeTranscript.status,
            turn_id: realtimeTranscript.turn_id,
            response_id: realtimeTranscript.response_id,
            audio_stream: realtimeTranscript.audio_stream,
            audio_offset: realtimeTranscript.audio_offset,
            audio_next_offset: realtimeTranscript.audio_next_offset,
            sample_start: realtimeTranscript.sample_start,
            sample_end: realtimeTranscript.sample_end,
            created_at: realtimeTranscript.created_at,
          })),
    })
)
eq(delta.realtime_transcript_id, realtimeTranscript.key) + ) + .orderBy(({ delta }) => coalesce(delta._timeline_order, `~`)) + .orderBy(({ delta }) => delta.key) + .select(({ delta }) => delta.delta) + ) + ), + status: realtimeTranscript.status, + turn_id: realtimeTranscript.turn_id, + response_id: realtimeTranscript.response_id, + audio_stream: realtimeTranscript.audio_stream, + audio_offset: realtimeTranscript.audio_offset, + audio_next_offset: realtimeTranscript.audio_next_offset, + sample_start: realtimeTranscript.sample_start, + sample_end: realtimeTranscript.sample_end, + created_at: realtimeTranscript.created_at, + })) + const runItemsSource = q .unionAll({ text: db.collections.texts, @@ -1322,59 +1492,72 @@ function buildEntityTimelineQuery( }), })) - const runSource = q.from({ run: db.collections.runs }).select(({ run }) => ({ - key: run.key, - order: coalesce(run._timeline_order, `~`), - status: run.status, - finish_reason: run.finish_reason, - items: q - .from({ item: runItemsSource }) - .where(({ item }) => eq(item.run_id, run.key)) - .orderBy(({ item }) => item.order) - .orderBy(({ item }) => - coalesce( - caseWhen(item.text.key, `text`), - caseWhen(item.toolCall.key, `toolCall`), - `` + const runItemAnchorSource = q + .from({ item: runItemsSource }) + .groupBy(({ item }) => item.run_id) + .select(({ item }) => ({ + run_id: item.run_id, + order: min(item.order), + })) + + const runSource = q + .from({ run: db.collections.runs }) + .leftJoin({ anchor: runItemAnchorSource }, ({ run, anchor }) => + eq(anchor.run_id, run.key) + ) + .select(({ run, anchor }) => ({ + key: run.key, + order: coalesce(anchor.order, run._timeline_order, `~`), + status: run.status, + finish_reason: run.finish_reason, + items: q + .from({ item: runItemsSource }) + .where(({ item }) => eq(item.run_id, run.key)) + .orderBy(({ item }) => item.order) + .orderBy(({ item }) => + coalesce( + caseWhen(item.text.key, `text`), + caseWhen(item.toolCall.key, `toolCall`), + `` + ) ) - ) - 
.orderBy(({ item }) => coalesce(item.text.key, item.toolCall.key, ``)) - .select(({ item }) => ({ - text: caseWhen(item.text.key, { - key: item.text.key, - run_id: item.text.run_id, - order: item.text.order, - status: item.text.status, - content: item.textContent, - }), - toolCall: item.toolCall, - })), - steps: q - .from({ step: db.collections.steps }) - .where(({ step }) => eq(step.run_id, run.key)) - .orderBy(({ step }) => step.step_number) - .orderBy(({ step }) => coalesce(step._timeline_order, `~`)) - .orderBy(({ step }) => step.key) - .select(({ step }) => ({ - key: step.key, - run_id: step.run_id, - order: coalesce(step._timeline_order, `~`), - step_number: step.step_number, - status: step.status, - model_id: step.model_id, - duration_ms: step.duration_ms, - })), - errors: q - .from({ error: db.collections.errors }) - .where(({ error }) => eq(error.run_id, run.key)) - .orderBy(({ error }) => error.key) - .select(({ error }) => ({ - key: error.key, - run_id: error.run_id, - error_code: error.error_code, - message: error.message, - })), - })) + .orderBy(({ item }) => coalesce(item.text.key, item.toolCall.key, ``)) + .select(({ item }) => ({ + text: caseWhen(item.text.key, { + key: item.text.key, + run_id: item.text.run_id, + order: item.text.order, + status: item.text.status, + content: item.textContent, + }), + toolCall: item.toolCall, + })), + steps: q + .from({ step: db.collections.steps }) + .where(({ step }) => eq(step.run_id, run.key)) + .orderBy(({ step }) => step.step_number) + .orderBy(({ step }) => coalesce(step._timeline_order, `~`)) + .orderBy(({ step }) => step.key) + .select(({ step }) => ({ + key: step.key, + run_id: step.run_id, + order: coalesce(step._timeline_order, `~`), + step_number: step.step_number, + status: step.status, + model_id: step.model_id, + duration_ms: step.duration_ms, + })), + errors: q + .from({ error: db.collections.errors }) + .where(({ error }) => eq(error.run_id, run.key)) + .orderBy(({ error }) => error.key) + 
.select(({ error }) => ({ + key: error.key, + run_id: error.run_id, + error_code: error.error_code, + message: error.message, + })), + })) return q .unionAll({ @@ -1382,30 +1565,41 @@ function buildEntityTimelineQuery( run: runSource, wake: wakeSource, signal: signalSource, + realtimeTranscript: realtimeTranscriptSource, manifest: db.collections.manifests, }) - .orderBy(({ inbox, run, wake, signal, manifest }) => + .orderBy(({ inbox, run, wake, signal, realtimeTranscript, manifest }) => coalesce( inbox.order, run.order, wake.order, signal.order, + realtimeTranscript.order, manifest._timeline_order, `~` ) ) - .orderBy(({ inbox, run, wake, signal, manifest }) => + .orderBy(({ inbox, run, wake, signal, realtimeTranscript, manifest }) => coalesce( caseWhen(inbox.key, `inbox`), caseWhen(run.key, `run`), caseWhen(wake.key, `wake`), caseWhen(signal.key, `signal`), + caseWhen(realtimeTranscript.key, `realtimeTranscript`), caseWhen(manifest.key, `manifest`), `` ) ) - .orderBy(({ inbox, run, wake, signal, manifest }) => - coalesce(inbox.key, run.key, wake.key, signal.key, manifest.key, ``) + .orderBy(({ inbox, run, wake, signal, realtimeTranscript, manifest }) => + coalesce( + inbox.key, + run.key, + wake.key, + signal.key, + realtimeTranscript.key, + manifest.key, + `` + ) ) } @@ -1419,6 +1613,8 @@ export function createEntityIncludesQuery( const inboxCollection = getEntityInboxCollection(db) const wakesCollection = getEntityWakesCollection(db) const signalsCollection = getEntitySignalsCollection(db) + const realtimeTranscriptsCollection = + getEntityRealtimeTranscriptsCollection(db) const entitiesCollection = getEntityEntitiesCollection(db) return (q: InitialQueryBuilder) => @@ -1558,6 +1754,30 @@ export function createEntityIncludesQuery( new_state: signal.new_state, })) ), + realtimeTranscripts: toArray( + q + .from({ realtimeTranscript: realtimeTranscriptsCollection }) + .where(({ realtimeTranscript }) => + eq(realtimeTranscript.timelineKey, timeline.key) + ) + 
.orderBy(({ realtimeTranscript }) => realtimeTranscript.order) + .select(({ realtimeTranscript }) => ({ + key: realtimeTranscript.key, + order: realtimeTranscript.order, + session_id: realtimeTranscript.session_id, + direction: realtimeTranscript.direction, + text: realtimeTranscript.text, + status: realtimeTranscript.status, + turn_id: realtimeTranscript.turn_id, + response_id: realtimeTranscript.response_id, + audio_stream: realtimeTranscript.audio_stream, + audio_offset: realtimeTranscript.audio_offset, + audio_next_offset: realtimeTranscript.audio_next_offset, + sample_start: realtimeTranscript.sample_start, + sample_end: realtimeTranscript.sample_end, + created_at: realtimeTranscript.created_at, + })) + ), entities: toArray( q .from({ entity: entitiesCollection }) diff --git a/packages/agents-runtime/src/index.ts b/packages/agents-runtime/src/index.ts index 3275e31be3..878092471c 100644 --- a/packages/agents-runtime/src/index.ts +++ b/packages/agents-runtime/src/index.ts @@ -9,8 +9,30 @@ export type { ManifestContextEntry, ManifestEntry, ManifestEffectEntry, + ManifestRealtimeSessionEntry, ManifestSourceEntry, ManifestSharedStateEntry, + RealtimeAudioSpan, + RealtimeAudioConfig, + RealtimeAudioFormat, + RealtimeConfig, + RealtimeContextConfig, + RealtimeHandle, + RealtimeHelpers, + RealtimeProviderConfig, + RealtimeProviderConnectInput, + RealtimeProviderEvent, + RealtimeProviderSession, + RealtimeRunResult, + RealtimeSession, + RealtimeSessionPolicy, + RealtimeSessionStatus, + RealtimeSessionStreamRefs, + RealtimeToolPolicy, + RealtimeToolResult, + RealtimeTranscript, + RealtimeTranscriptEvent, + RealtimeTurnDetectionConfig, PendingSend, EffectConfig, ObservationSource, @@ -113,11 +135,39 @@ export type { AttachmentSubject, AttachmentSubjectType, ManifestContextEntry as ManifestContextEntryRow, + ManifestRealtimeSessionEntry as ManifestRealtimeSessionEntryRow, + RealtimeAudioSpan as RealtimeAudioSpanRow, + RealtimeSession as RealtimeSessionRow, + 
RealtimeSessionStatus as RealtimeSessionStatusRow, + RealtimeSessionStreamRefs as RealtimeSessionStreamRefsRow, + RealtimeTranscript as RealtimeTranscriptRow, ReplayWatermark, WakeConfigValue, } from './entity-schema' export { createEntityStreamDB } from './entity-stream-db' +export { createTestRealtimeProvider } from './realtime' +export type { TestRealtimeProviderOptions } from './realtime' +export { createOpenAIRealtimeProvider } from './openai-realtime' +export type { OpenAIRealtimeProviderOptions } from './openai-realtime' +export { + DEFAULT_OPENAI_REALTIME_MODEL, + DEFAULT_OPENAI_REALTIME_REASONING_EFFORT, + DEFAULT_OPENAI_REALTIME_VOICE, + OPENAI_REALTIME_MODELS, + OPENAI_REALTIME_REASONING_EFFORTS, + OPENAI_REALTIME_VOICES, + isOpenAIRealtimeModel, + isOpenAIRealtimeReasoningEffort, + isOpenAIRealtimeVoice, +} from './realtime-options' +export type { + OpenAIRealtimeReasoningEffort, + RealtimeModelChoice, + RealtimeProviderId, + RealtimeReasoningEffortChoice, + RealtimeVoiceChoice, +} from './realtime-options' export { getEntityAttachmentStreamPath, manifestAttachmentKey, @@ -250,6 +300,9 @@ export type { DispatchPolicy, SpawnEntityOptions, SendEntityMessageOptions, + RealtimeAudioOptions, + RealtimeSessionStartResult, + StartRealtimeSessionOptions, } from './runtime-server-client' export { buildEventSourceManifestEntry, diff --git a/packages/agents-runtime/src/openai-realtime.ts b/packages/agents-runtime/src/openai-realtime.ts new file mode 100644 index 0000000000..b72f5cc3ed --- /dev/null +++ b/packages/agents-runtime/src/openai-realtime.ts @@ -0,0 +1,908 @@ +import type { + AgentTool, + LLMMessage, + RealtimeAudioFormat, + RealtimeProviderConfig, + RealtimeProviderConnectInput, + RealtimeProviderEvent, + RealtimeProviderSession, + RealtimeToolResult, + RealtimeTurnDetectionConfig, +} from './types' +import { + DEFAULT_OPENAI_REALTIME_MODEL, + DEFAULT_OPENAI_REALTIME_REASONING_EFFORT, + type OpenAIRealtimeReasoningEffort, +} from './realtime-options' + 
/**
 * Unbounded FIFO queue exposed as an async iterable.
 *
 * - `push` hands the value to the oldest waiting `next()` call, or buffers it
 *   when nobody is waiting. Values pushed after `close`/`fail` are dropped.
 * - `close` ends the stream: waiting consumers receive `done: true`, and later
 *   `next()` calls drain any buffered values before reporting `done: true`.
 * - `fail` ends the stream with an error: waiting consumers are rejected, and
 *   later `next()` calls drain any buffered values before rejecting with the
 *   same error on every call.
 *
 * Only the first `close`/`fail` takes effect.
 */
class AsyncEventQueue<T> implements AsyncIterable<T> {
  private values: Array<T> = []
  private resolvers: Array<{
    resolve: (value: IteratorResult<T>) => void
    reject: (error: unknown) => void
  }> = []
  private closed = false
  // Tracked separately from `error` so that falsy failure values
  // (`undefined`, `0`, ``) still reject instead of looking like a clean close.
  private failed = false
  private error: unknown

  /** Deliver `value` to a waiting consumer, or buffer it. No-op once ended. */
  push(value: T): void {
    if (this.closed) return
    const resolver = this.resolvers.shift()
    if (resolver) {
      resolver.resolve({ value, done: false })
      return
    }
    this.values.push(value)
  }

  /** End the stream normally and release every waiting consumer. */
  close(): void {
    if (this.closed) return
    this.closed = true
    for (const resolver of this.resolvers.splice(0)) {
      resolver.resolve({ value: undefined as T, done: true })
    }
  }

  /** End the stream with `error` and reject every waiting consumer. */
  fail(error: unknown): void {
    if (this.closed) return
    this.failed = true
    this.error = error
    this.closed = true
    for (const resolver of this.resolvers.splice(0)) {
      resolver.reject(error)
    }
  }

  [Symbol.asyncIterator](): AsyncIterator<T> {
    return {
      next: () => {
        // Buffered values are always delivered first, even after close/fail.
        if (this.values.length > 0) {
          return Promise.resolve({ value: this.values.shift() as T, done: false })
        }
        if (this.failed) {
          return Promise.reject(this.error)
        }
        if (this.closed) {
          return Promise.resolve({ value: undefined as T, done: true })
        }
        return new Promise<IteratorResult<T>>((resolve, reject) => {
          this.resolvers.push({ resolve, reject })
        })
      },
    }
  }
}
/**
 * Convert a WebSocket payload (message data or close reason) to text.
 *
 * Strings pass through; `ArrayBuffer` and `Uint8Array` values are decoded with
 * `TextDecoder` (UTF-8). A non-empty array made only of such binary chunks is
 * treated as a fragmented payload: the chunks are concatenated as bytes and
 * decoded once, so a multi-byte character split across fragments decodes
 * correctly. Anything else falls back to its own `toString()` / `String()`.
 *
 * @param data - Raw value handed over by the socket implementation.
 * @returns The textual form of `data`.
 */
function dataToString(data: unknown): string {
  if (typeof data === `string`) return data
  if (data instanceof ArrayBuffer) return new TextDecoder().decode(data)
  if (data instanceof Uint8Array) return new TextDecoder().decode(data)
  if (
    Array.isArray(data) &&
    data.length > 0 &&
    data.every(
      (part) => part instanceof ArrayBuffer || part instanceof Uint8Array
    )
  ) {
    // Without this branch Array.prototype.toString would comma-join the
    // fragments and corrupt the payload.
    const parts = data.map((part: ArrayBuffer | Uint8Array) =>
      part instanceof Uint8Array ? part : new Uint8Array(part)
    )
    const joined = new Uint8Array(
      parts.reduce((total, part) => total + part.byteLength, 0)
    )
    let offset = 0
    for (const part of parts) {
      joined.set(part, offset)
      offset += part.byteLength
    }
    return new TextDecoder().decode(joined)
  }
  if (
    data &&
    typeof data === `object` &&
    `toString` in data &&
    typeof data.toString === `function`
  ) {
    return data.toString()
  }
  return String(data)
}
/**
 * Flatten message content into plain text.
 *
 * A string is returned unchanged. For an array, every element that is a
 * non-null object with a non-empty string `text` property contributes that
 * text; the pieces are joined with newlines. Any other content yields ``.
 */
function messageContentText(content: unknown): string {
  if (typeof content === `string`) return content
  if (!Array.isArray(content)) return ``

  const pieces: Array<string> = []
  for (const part of content) {
    if (part === null || typeof part !== `object`) continue
    const candidate = (part as { text?: unknown }).text
    if (typeof candidate === `string` && candidate.length > 0) {
      pieces.push(candidate)
    }
  }
  return pieces.join(`\n`)
}
/**
 * Build the input-transcription settings for a session, if any.
 *
 * Returns `undefined` when no input audio format is configured or when
 * transcription is explicitly set to `false`. Otherwise returns an object
 * with `model` (configured value or the default transcription model) followed
 * by `language`, `prompt` and `delay` for each of those that is truthy.
 */
function inputTranscription(
  input: RealtimeProviderConnectInput
): Record<string, unknown> | undefined {
  const audio = input.audio
  if (!audio?.inputFormat) return undefined
  if (audio.inputTranscription === false) return undefined

  const settings = audio.inputTranscription ?? {}
  const transcription: Record<string, unknown> = {
    model: settings.model ?? DEFAULT_OPENAI_INPUT_TRANSCRIPTION_MODEL,
  }
  if (settings.language) transcription.language = settings.language
  if (settings.prompt) transcription.prompt = settings.prompt
  if (settings.delay) transcription.delay = settings.delay
  return transcription
}
/**
 * Assemble the `session.update` event sent to configure a realtime session.
 *
 * The session always carries type, model, instructions, output modalities and
 * tool choice. Optional sections are added only when they apply:
 * - `reasoning` when the model is the default realtime model,
 * - `tools` when at least one tool is supplied,
 * - `audio.input` when an input format is configured (with transcription
 *   settings if any, and the turn-detection config),
 * - `audio.output` when an output format or a voice is configured.
 *
 * Audio output is requested when either an output format or a voice is set;
 * otherwise the session asks for text output.
 */
function buildSessionUpdate(
  opts: OpenAIRealtimeProviderOptions,
  input: RealtimeProviderConnectInput
): Record<string, unknown> {
  const inputFormat = realtimeFormat(input.audio?.inputFormat)
  const outputFormat = realtimeFormat(input.audio?.outputFormat)
  const transcription = inputTranscription(input)
  const model = opts.model ?? DEFAULT_OPENAI_REALTIME_MODEL
  const wantsAudioOutput = Boolean(outputFormat || opts.voice)
  const hasTools = input.tools.length > 0

  const session: Record<string, unknown> = {
    type: `realtime`,
    model,
    instructions: input.systemPrompt,
    output_modalities: wantsAudioOutput ? [`audio`] : [`text`],
    tool_choice: hasTools ? `auto` : `none`,
  }

  // Reasoning effort is only sent for the default realtime model.
  if (model === DEFAULT_OPENAI_REALTIME_MODEL) {
    const effort =
      opts.reasoningEffort ?? DEFAULT_OPENAI_REALTIME_REASONING_EFFORT
    if (effort) session.reasoning = { effort }
  }

  if (hasTools) {
    session.tools = input.tools.map((tool) => toOpenAITool(tool))
  }

  if (inputFormat || wantsAudioOutput) {
    const audio: Record<string, unknown> = {}
    if (inputFormat) {
      const audioInput: Record<string, unknown> = { format: inputFormat }
      if (transcription) audioInput.transcription = transcription
      audioInput.turn_detection = realtimeTurnDetection(
        input.audio?.turnDetection
      )
      audio.input = audioInput
    }
    if (wantsAudioOutput) {
      const audioOutput: Record<string, unknown> = {}
      if (outputFormat) audioOutput.format = outputFormat
      if (opts.voice) audioOutput.voice = opts.voice
      audio.output = audioOutput
    }
    session.audio = audio
  }

  return { type: `session.update`, session }
}
/**
 * Serialises a tool result into the string carried by the `output` field of
 * a `function_call_output` conversation item.
 *
 * String results are passed through untouched; everything else is JSON
 * encoded.
 *
 * @param result - The completed tool call whose `result` should be encoded.
 * @returns Always a string, so the `output` field is never dropped when the
 *   payload is serialised.
 * @throws Whatever `JSON.stringify` throws for values it cannot encode
 *   (e.g. circular structures or `bigint`).
 */
function toolResultOutput(result: RealtimeToolResult): string {
  if (typeof result.result === `string`) return result.result
  // JSON.stringify yields `undefined` (not a string) for `undefined`,
  // functions and symbols — e.g. a tool that returns nothing. Fall back to
  // the JSON literal `null` so the declared `string` contract holds.
  return JSON.stringify(result.result) ?? `null`
}
String(event.audio_start_ms) + : undefined, + turnId: typeof event.item_id === `string` ? event.item_id : undefined, + }, + ] + case `input_audio_buffer.speech_stopped`: + return [ + { + type: `input_audio.speech_stopped`, + audioOffset: + typeof event.audio_end_ms === `number` + ? String(event.audio_end_ms) + : undefined, + turnId: typeof event.item_id === `string` ? event.item_id : undefined, + }, + ] + case `input_audio_buffer.committed`: + return [ + { + type: `input_audio.committed`, + turnId: openAIString(event.item_id), + previousTurnId: openAIString(event.previous_item_id), + }, + ] + case `conversation.item.input_audio_transcription.delta`: + return [ + { + type: `input_transcript.delta`, + delta: String(event.delta ?? ``), + turnId: typeof event.item_id === `string` ? event.item_id : undefined, + }, + ] + case `conversation.item.input_audio_transcription.completed`: + return [ + { + type: `input_transcript.completed`, + text: String(event.transcript ?? ``), + turnId: typeof event.item_id === `string` ? event.item_id : undefined, + }, + ] + case `response.created`: + return [ + { + type: `response.started`, + responseId: + typeof event.response?.id === `string` + ? event.response.id + : undefined, + }, + ] + case `response.audio.delta`: + case `response.output_audio.delta`: + return [ + { + type: `output_audio.delta`, + audio: base64ToBytes(String(event.delta ?? ``)), + responseId: + typeof event.response_id === `string` + ? event.response_id + : undefined, + itemId: typeof event.item_id === `string` ? event.item_id : undefined, + }, + ] + case `response.audio.done`: + case `response.output_audio.done`: + return [ + { + type: `output_audio.completed`, + responseId: + typeof event.response_id === `string` + ? event.response_id + : undefined, + itemId: typeof event.item_id === `string` ? 
/**
 * Builds a realtime provider that talks to the OpenAI realtime API over a
 * WebSocket.
 *
 * `connect()` opens the socket, waits for it to open, sends one
 * `session.update` built from `opts` and the connect input, replays
 * `input.messages` as conversation items, and then returns a session whose
 * `events` is a queue fed by the socket's messages.
 *
 * @param opts - Provider options (API key or key resolver, model, url,
 *   extra headers, safety identifier, voice, ...).
 * @returns A provider config with id `openai` and the resolved model.
 * @throws From `connect()` when no API key resolves, when the socket errors
 *   or closes before it opens, or when `input.signal` aborts before it opens.
 */
export function createOpenAIRealtimeProvider(
  opts: OpenAIRealtimeProviderOptions
): RealtimeProviderConfig {
  const model = opts.model ?? DEFAULT_OPENAI_REALTIME_MODEL

  return {
    id: `openai`,
    model,
    async connect(input): Promise<RealtimeProviderSession> {
      const apiKey =
        typeof opts.apiKey === `function` ? await opts.apiKey() : opts.apiKey
      if (!apiKey) {
        throw new Error(`[agent-runtime] OpenAI realtime apiKey is required`)
      }

      const WebSocketCtor = resolveWebSocket(opts)
      const url = new URL(opts.url ?? `wss://api.openai.com/v1/realtime`)
      url.searchParams.set(`model`, model)
      // Caller-supplied headers are spread last, so they can override
      // Authorization.
      const headers: Record<string, string> = {
        Authorization: `Bearer ${apiKey}`,
        ...opts.headers,
      }
      if (opts.safetyIdentifier) {
        headers[`OpenAI-Safety-Identifier`] = opts.safetyIdentifier
      }

      const ws = new WebSocketCtor(url.toString(), { headers })
      const queue = new AsyncEventQueue()
      const toolsByName = new Map(
        input.tools.map((tool) => [toolName(tool), tool])
      )
      // Provider event ids already handled; repeated ids are ignored.
      const seenProviderEventIds = new Set<string>()
      let socketOpen = false
      let socketClosed = false
      let clientCloseRequested = false
      // Bumped on every `response.created` and on cancelResponse(); a tool
      // call started under an older epoch does not send its result back.
      let responseEpoch = 0
      let rejectOpen: ((error: Error) => void) | undefined

      // Emits `session.closed` and closes the queue exactly once.
      const closeQueue = (reason?: string): void => {
        if (socketClosed) return
        socketClosed = true
        queue.push({ type: `session.closed`, reason })
        queue.close()
        input.signal?.removeEventListener(`abort`, handleAbort)
      }

      // Abort handling: close the session, and reject connect() if the
      // socket never got as far as opening.
      const handleAbort = (): void => {
        const error = new Error(
          `[agent-runtime] OpenAI realtime WebSocket aborted`
        )
        clientCloseRequested = true
        closeQueue(`aborted`)
        ws.close?.(1000, `aborted`)
        if (!socketOpen) rejectOpen?.(error)
      }

      // Sends the tool output back and asks for a follow-up response.
      const sendToolResult = async (
        result: RealtimeToolResult
      ): Promise<void> => {
        sendJson(ws, {
          type: `conversation.item.create`,
          item: {
            type: `function_call_output`,
            call_id: result.toolCallId,
            output: toolResultOutput(result),
          },
        })
        sendJson(ws, { type: `response.create` })
      }

      // Runs the tool named by a completed function call, publishes the
      // outcome on the queue, and forwards it to the provider unless the
      // session ended or the response was superseded in the meantime.
      const executeToolCall = async (
        event: OpenAIRealtimeEvent
      ): Promise<void> => {
        const toolResponseEpoch = responseEpoch
        const shouldDropToolResult = (): boolean =>
          Boolean(
            clientCloseRequested ||
              socketClosed ||
              input.signal?.aborted ||
              toolResponseEpoch !== responseEpoch
          )
        const item = event.item ?? {}
        const toolCallId = String(
          event.call_id ?? item.call_id ?? item.id ?? event.item_id ?? ``
        )
        const name = String(event.name ?? item.name ?? ``)
        const args = parseToolArgs(event.arguments ?? item.arguments)
        queue.push({
          type: `tool_call.arguments_completed`,
          toolCallId,
          name,
          args,
        })
        const tool = toolsByName.get(name)
        if (!tool) {
          const result: RealtimeToolResult = {
            toolCallId,
            name,
            result: `Tool "${name}" is not available.`,
            isError: true,
          }
          queue.push({ type: `tool_call.completed`, ...result })
          await sendToolResult(result)
          return
        }

        try {
          const prepared =
            typeof tool.prepareArguments === `function`
              ? tool.prepareArguments(args)
              : args
          const result = await tool.execute(
            toolCallId,
            prepared as never,
            input.signal
          )
          const realtimeResult: RealtimeToolResult = {
            toolCallId,
            name,
            result,
          }
          queue.push({ type: `tool_call.completed`, ...realtimeResult })
          if (shouldDropToolResult()) return
          await sendToolResult(realtimeResult)
        } catch (error) {
          const realtimeResult: RealtimeToolResult = {
            toolCallId,
            name,
            result: error instanceof Error ? error.message : String(error),
            isError: true,
          }
          queue.push({ type: `tool_call.completed`, ...realtimeResult })
          if (shouldDropToolResult()) return
          await sendToolResult(realtimeResult)
        }
      }

      const opened = new Promise<void>((resolve, reject) => {
        rejectOpen = reject
        onSocket(ws, `open`, () => {
          if (socketClosed) return
          if (input.signal?.aborted) {
            // Abort first and only then mark the socket open: handleAbort
            // rejects `opened` only while `socketOpen` is still false, so
            // flipping the flag earlier would leave connect() pending.
            handleAbort()
            return
          }
          socketOpen = true
          resolve()
        })
        onSocket(ws, `error`, (event) => {
          const error =
            event instanceof Error
              ? event
              : new Error(`[agent-runtime] OpenAI realtime WebSocket error`)
          input.signal?.removeEventListener(`abort`, handleAbort)
          queue.fail(error)
          reject(error)
        })
      })

      onSocket(ws, `message`, (...args) => {
        try {
          const parsed = JSON.parse(
            dataToString(socketMessageData(args))
          ) as OpenAIRealtimeEvent
          if (typeof parsed.event_id === `string`) {
            if (seenProviderEventIds.has(parsed.event_id)) return
            seenProviderEventIds.add(parsed.event_id)
          }
          if (parsed.type === `response.created`) {
            responseEpoch += 1
          }
          if (parsed.type === `response.function_call_arguments.done`) {
            // Tool execution runs in the background; failures fail the queue.
            void executeToolCall(parsed).catch((error) => queue.fail(error))
            return
          }
          for (const event of mapOpenAIEvent(parsed)) queue.push(event)
        } catch (error) {
          queue.fail(error)
        }
      })
      onSocket(ws, `close`, (...args) => {
        const details = socketCloseDetails(args)
        if (clientCloseRequested || input.signal?.aborted) {
          closeQueue(details.reason || undefined)
          return
        }
        const closeError = socketCloseError(details)
        queue.push({
          type: `session.error`,
          code: `websocket_closed`,
          error: closeError,
        })
        closeQueue(details.reason || `websocket_closed`)
        // A socket that closes before it ever opened, without an `error`
        // event, would otherwise leave `opened` pending and connect() hung.
        // Rejecting an already-settled promise is a no-op, so this is safe
        // when `error` fired first.
        if (!socketOpen) rejectOpen?.(new Error(closeError))
      })

      if (input.signal?.aborted) {
        handleAbort()
      } else {
        input.signal?.addEventListener(`abort`, handleAbort, { once: true })
      }

      await opened
      sendJson(ws, buildSessionUpdate(opts, input))
      for (const message of input.messages) {
        sendConversationMessage(ws, message)
      }

      return {
        events: queue,
        appendInputAudio: async (chunk) => {
          for (const appendChunk of inputAudioAppendChunks(chunk)) {
            sendJson(ws, {
              type: `input_audio_buffer.append`,
              audio: bytesToBase64(appendChunk),
            })
          }
        },
        clearInputAudio: async () => {
          sendJson(ws, { type: `input_audio_buffer.clear` })
        },
        commitInputAudio: async () => {
          sendJson(ws, { type: `input_audio_buffer.commit` })
          sendJson(ws, { type: `response.create` })
        },
        sendText: async (text) => {
          sendJson(ws, {
            type: `conversation.item.create`,
            item: {
              type: `message`,
              role: `user`,
              content: [{ type: `input_text`, text }],
            },
          })
          sendJson(ws, { type: `response.create` })
        },
        sendToolResult,
        cancelResponse: async () => {
          // Invalidate in-flight tool calls before cancelling.
          responseEpoch += 1
          sendJson(ws, { type: `response.cancel` })
        },
        truncateOutputAudio: async ({ itemId, audioEndMs }) => {
          sendJson(ws, {
            type: `conversation.item.truncate`,
            item_id: itemId,
            content_index: 0,
            audio_end_ms: audioEndMs,
          })
        },
        close: async (reason) => {
          clientCloseRequested = true
          closeQueue(reason)
          ws.close?.(1000, reason)
        },
      }
    },
  }
}
OpenAIRealtimeReasoningEffort = `low` + +export const OPENAI_REALTIME_MODELS = [ + { + id: `gpt-realtime-2`, + label: `GPT-Realtime-2`, + description: `Strongest realtime reasoning, tool use, and instruction following.`, + recommended: true, + }, + { + id: `gpt-realtime-1.5`, + label: `GPT-Realtime-1.5`, + description: `Fast, reliable speech-to-speech model for audio in, audio out.`, + }, + { + id: `gpt-realtime-mini`, + label: `GPT-Realtime mini`, + description: `Cost-efficient realtime voice model.`, + }, +] as const satisfies ReadonlyArray + +export const OPENAI_REALTIME_VOICES = [ + { + id: `marin`, + label: `Marin`, + description: `OpenAI recommended voice with the strongest naturalness.`, + recommended: true, + }, + { + id: `cedar`, + label: `Cedar`, + description: `OpenAI recommended voice with a distinct, expressive tone.`, + recommended: true, + }, + { + id: `alloy`, + label: `Alloy`, + description: `Balanced general-purpose voice.`, + }, + { + id: `ash`, + label: `Ash`, + description: `Clear general-purpose voice.`, + }, + { + id: `ballad`, + label: `Ballad`, + description: `Warm general-purpose voice.`, + }, + { + id: `coral`, + label: `Coral`, + description: `Bright general-purpose voice.`, + }, + { + id: `echo`, + label: `Echo`, + description: `Steady general-purpose voice.`, + }, + { + id: `sage`, + label: `Sage`, + description: `Calm general-purpose voice.`, + }, + { + id: `shimmer`, + label: `Shimmer`, + description: `Light general-purpose voice.`, + }, + { + id: `verse`, + label: `Verse`, + description: `Expressive general-purpose voice.`, + }, +] as const satisfies ReadonlyArray + +export const OPENAI_REALTIME_REASONING_EFFORTS = [ + { + id: `low`, + label: `Low`, + description: `Lowest recommended latency for production voice agents.`, + recommended: true, + }, + { + id: `medium`, + label: `Medium`, + description: `More reasoning for harder requests, with higher latency.`, + }, + { + id: `high`, + label: `High`, + description: `Deepest reasoning; 
/**
 * Creates a scripted realtime provider for tests.
 *
 * Each `connect()` replays a fixed list of events: `opts.events` when given;
 * otherwise `session.started`, then (only when `opts.response` is set) a
 * completed output transcript plus `response.completed`, then
 * `session.closed`.
 *
 * @param opts - Optional model name, explicit event script, or a canned
 *   response text.
 * @returns A provider config with id `test`.
 */
export function createTestRealtimeProvider(
  opts: TestRealtimeProviderOptions = {}
): RealtimeProviderConfig {
  // Resolved on every connect, so the options are read at connect time.
  const scriptedEvents = (): Array<RealtimeProviderEvent> => {
    if (opts.events != null) return opts.events

    const script: Array<RealtimeProviderEvent> = [{ type: `session.started` }]
    if (opts.response != null) {
      script.push(
        { type: `output_transcript.completed`, text: opts.response },
        { type: `response.completed` }
      )
    }
    script.push({ type: `session.closed` })
    return script
  }

  async function* replay(
    script: Array<RealtimeProviderEvent>
  ): AsyncGenerator<RealtimeProviderEvent> {
    for (const scripted of script) {
      yield scripted
    }
  }

  const model = opts.model ?? `test-realtime`
  return {
    id: `test`,
    model,
    async connect() {
      return { events: replay(scriptedEvents()) }
    },
  }
}
/**
 * Asks the server to start a realtime session by POSTing the options as
 * JSON to `/_electric/realtime/sessions`.
 *
 * @param options - Session request; sent to the server verbatim.
 * @returns The parsed JSON response body, typed as the session start result
 *   (the body is not validated here).
 * @throws Error carrying the entity URL, HTTP status and the server's error
 *   text when the response is not OK.
 */
const startRealtimeSession = async (
  options: StartRealtimeSessionOptions
): Promise<RealtimeSessionStartResult> => {
  const payload = JSON.stringify(options)
  const response = await request(`/_electric/realtime/sessions`, {
    method: `POST`,
    headers: { 'content-type': `application/json` },
    body: payload,
  })

  if (response.ok) {
    return (await response.json()) as RealtimeSessionStartResult
  }

  const detail = await readErrorText(response)
  throw new Error(
    `startRealtimeSession ${options.entityUrl} failed (${response.status}): ${detail}`
  )
}
/**
 * Reports whether a wake payload contains a change whose own payload has
 * `type === 'realtime_session.started'`.
 *
 * Anything that is not an object with an array `changes` property yields
 * `false`, as do change entries that are not objects.
 */
function isRealtimeSessionWake(payload: unknown): boolean {
  // Narrow an unknown value to a property bag, or undefined when it is
  // null / not an object.
  const asRecord = (value: unknown): Record<string, unknown> | undefined =>
    value !== null && typeof value === `object`
      ? (value as Record<string, unknown>)
      : undefined

  const changes = asRecord(payload)?.changes
  if (!Array.isArray(changes)) return false

  for (const change of changes) {
    const changePayload = asRecord(asRecord(change)?.payload)
    if (changePayload?.type === `realtime_session.started`) return true
  }
  return false
}
[]) + .filter((item) => item.text.length > 0) + .map((item) => ({ + kind: `realtime_transcript` as const, + order: item.order, + item, + })), ...data.runs.map((item) => ({ kind: `run` as const, order: item.order, @@ -429,6 +470,17 @@ export function materializeTimeline( signal: entry.item, } + case `realtime_transcript`: + return { + kind: `realtime_transcript`, + at: orderToOffset(entry.order), + key: entry.item.key, + sessionId: entry.item.session_id, + direction: entry.item.direction, + text: entry.item.text, + status: entry.item.status, + } + case `run`: return materializeRunItem(entry.item) diff --git a/packages/agents-runtime/src/types.ts b/packages/agents-runtime/src/types.ts index ec366ab670..53f65fab0d 100644 --- a/packages/agents-runtime/src/types.ts +++ b/packages/agents-runtime/src/types.ts @@ -43,8 +43,14 @@ import type { ManifestCronScheduleEntry as EntityManifestCronScheduleEntry, ManifestEffectEntry as EntityManifestEffectEntry, ManifestFutureSendScheduleEntry as EntityManifestFutureSendScheduleEntry, + ManifestRealtimeSessionEntry as EntityManifestRealtimeSessionEntry, ManifestSharedStateEntry as EntityManifestSharedStateEntry, ManifestSourceEntry as EntityManifestSourceEntry, + RealtimeAudioSpan as EntityRealtimeAudioSpan, + RealtimeSession as EntityRealtimeSession, + RealtimeSessionStatus as EntityRealtimeSessionStatus, + RealtimeSessionStreamRefs as EntityRealtimeSessionStreamRefs, + RealtimeTranscript as EntityRealtimeTranscript, Signal as EntitySignalEntry, WakeEntry, } from './entity-schema' @@ -321,8 +327,14 @@ export type ManifestCronScheduleEntry = EntityManifestCronScheduleEntry export type ManifestEffectEntry = EntityManifestEffectEntry export type ManifestFutureSendScheduleEntry = EntityManifestFutureSendScheduleEntry +export type ManifestRealtimeSessionEntry = EntityManifestRealtimeSessionEntry export type ManifestSourceEntry = EntityManifestSourceEntry export type ManifestSharedStateEntry = EntityManifestSharedStateEntry +export type 
RealtimeSession = EntityRealtimeSession +export type RealtimeSessionStatus = EntityRealtimeSessionStatus +export type RealtimeSessionStreamRefs = EntityRealtimeSessionStreamRefs +export type RealtimeAudioSpan = EntityRealtimeAudioSpan +export type RealtimeTranscript = EntityRealtimeTranscript export type ContextInserted = EntityContextInserted export type ContextRemoved = EntityContextRemoved export type ContextEntryAttrs = EntityContextEntryAttrs @@ -370,6 +382,15 @@ export type TimelineItem = } | { kind: `wake`; at: number; payload: unknown } | { kind: `signal`; at: number; signal: EntitySignalEntry } + | { + kind: `realtime_transcript` + at: number + key: string + sessionId: string + direction: `input` | `output` + text: string + status: `partial` | `final` + } | { kind: `run` at: number @@ -919,6 +940,220 @@ export interface AgentConfig { testResponses?: TestResponses } +export type RealtimeAudioCodec = `pcm16` + +export interface RealtimeAudioFormat { + codec: RealtimeAudioCodec + sampleRate: number + channels: number +} + +export interface RealtimeInputTranscriptionConfig { + model?: string + language?: string + prompt?: string + delay?: `minimal` | `low` | `medium` | `high` | `xhigh` +} + +export type RealtimeTurnDetectionConfig = + | false + | { type: `none` } + | { + type: `server_vad` + threshold?: number + prefixPaddingMs?: number + silenceDurationMs?: number + createResponse?: boolean + interruptResponse?: boolean + } + | { + type: `semantic_vad` + eagerness?: `low` | `medium` | `high` | `auto` + createResponse?: boolean + interruptResponse?: boolean + } + +export interface RealtimeAudioConfig { + inputFormat?: RealtimeAudioFormat + outputFormat?: RealtimeAudioFormat + inputTranscription?: false | RealtimeInputTranscriptionConfig + turnDetection?: RealtimeTurnDetectionConfig +} + +export interface RealtimeToolPolicy { + direct?: Array + confirm?: Array + delegate?: Array +} + +export interface RealtimeSessionPolicy { + textDuringSession?: 
`route-to-realtime` + retention?: `forever` +} + +export interface RealtimeContextConfig { + includeTimeline?: boolean +} + +export type RealtimeProviderEvent = + | { type: `session.started`; sessionId?: string } + | { type: `session.updated` } + | { type: `session.closed`; reason?: string } + | { type: `session.error`; error: string; code?: string } + | { + type: `input_audio.speech_started` + audioOffset?: string + turnId?: string + } + | { + type: `input_audio.speech_stopped` + audioOffset?: string + turnId?: string + } + | { + type: `input_audio.committed` + turnId?: string + previousTurnId?: string + } + | { type: `input_transcript.delta`; delta: string; turnId?: string } + | { type: `input_transcript.completed`; text: string; turnId?: string } + | { + type: `output_audio.delta` + audio: Uint8Array + responseId?: string + itemId?: string + } + | { type: `output_audio.completed`; responseId?: string; itemId?: string } + | { + type: `output_transcript.delta` + delta: string + responseId?: string + itemId?: string + contentIndex?: number + transcriptSource?: + | `response.audio_transcript` + | `response.output_audio_transcript` + | `response.output_text` + } + | { + type: `output_transcript.completed` + text?: string + responseId?: string + itemId?: string + contentIndex?: number + transcriptSource?: + | `response.audio_transcript` + | `response.output_audio_transcript` + | `response.output_text` + } + | { type: `response.started`; responseId?: string } + | { type: `response.completed`; responseId?: string } + | { type: `response.cancelled`; responseId?: string } + | { + type: `tool_call.started` + toolCallId: string + name: string + args?: unknown + } + | { + type: `tool_call.arguments_delta` + toolCallId: string + delta: string + } + | { + type: `tool_call.arguments_completed` + toolCallId: string + name: string + args: unknown + } + | { + type: `tool_call.completed` + toolCallId: string + name: string + result: unknown + isError?: boolean + } + +export 
interface RealtimeProviderConnectInput { + systemPrompt: string + messages: Array + tools: Array + audio?: RealtimeAudioConfig + session?: ManifestRealtimeSessionEntry + signal?: AbortSignal +} + +export interface RealtimeToolResult { + toolCallId: string + name: string + result: unknown + isError?: boolean +} + +export interface RealtimeProviderSession { + events: AsyncIterable + updateSession?: (update: unknown) => Promise + appendInputAudio?: ( + chunk: Uint8Array, + meta?: Record + ) => Promise + clearInputAudio?: () => Promise + commitInputAudio?: () => Promise + sendText?: (text: string) => Promise + sendToolResult?: (result: RealtimeToolResult) => Promise + cancelResponse?: () => Promise + truncateOutputAudio?: (opts: { + itemId: string + audioEndMs: number + }) => Promise + close?: (reason?: string) => Promise +} + +export interface RealtimeProviderConfig { + id: string + model: string + connect: ( + input: RealtimeProviderConnectInput + ) => Promise +} + +export interface RealtimeTranscriptEvent { + key: string + sessionId: string + direction: `input` | `output` + text: string + status: `partial` | `final` + turnId?: string + responseId?: string +} + +export interface RealtimeConfig { + systemPrompt: string + provider: RealtimeProviderConfig + tools?: Array + audio?: RealtimeAudioConfig + toolPolicy?: RealtimeToolPolicy + context?: RealtimeContextConfig + session?: RealtimeSessionPolicy + onTranscript?: (transcript: RealtimeTranscriptEvent) => void | Promise + testResponses?: TestResponses +} + +export type RealtimeRunResult = AgentRunResult + +export interface RealtimeHandle { + run: () => Promise + close: (reason?: string) => Promise + stop: (reason?: string) => Promise + cancelResponse: (opts?: { truncateAudio?: boolean }) => Promise + sendText: (text: string) => Promise +} + +export interface RealtimeHelpers { + activeSession: () => ManifestRealtimeSessionEntry | undefined + sessions: () => Array +} + export type TestResponses = Array | TestResponseFn 
export type TestResponseFn = ( @@ -1018,6 +1253,7 @@ export interface HandlerContext< */ sandbox: Sandbox useAgent: (config: AgentConfig) => AgentHandle + useRealtime: (config: RealtimeConfig) => RealtimeHandle useContext: (config: UseContextConfig) => void timelineMessages: (opts?: TimelineProjectionOpts) => Array insertContext: (id: string, entry: ContextEntryInput) => void @@ -1025,6 +1261,7 @@ export interface HandlerContext< getContext: (id: string) => ContextEntry | undefined listContext: () => Array agent: AgentHandle + realtime: RealtimeHelpers spawn: ( type: string, id: string, diff --git a/packages/agents-runtime/test/electric-agents-client.test.ts b/packages/agents-runtime/test/electric-agents-client.test.ts index 7e60b9c9c3..309d7512df 100644 --- a/packages/agents-runtime/test/electric-agents-client.test.ts +++ b/packages/agents-runtime/test/electric-agents-client.test.ts @@ -8,6 +8,7 @@ const { mockState } = vi.hoisted(() => ({ ensureEntitiesMembershipStream: vi.fn(), ensureCronStream: vi.fn(), signalEntity: vi.fn(), + startRealtimeSession: vi.fn(), ensureStream: vi.fn(), createStreamDB: vi.fn(), preload: vi.fn(), @@ -25,6 +26,7 @@ vi.mock(`../src/runtime-server-client`, () => ({ ensureEntitiesMembershipStream: mockState.ensureEntitiesMembershipStream, ensureCronStream: mockState.ensureCronStream, signalEntity: mockState.signalEntity, + startRealtimeSession: mockState.startRealtimeSession, ensureStream: mockState.ensureStream, }), })) @@ -49,6 +51,20 @@ describe(`createAgentsClient`, () => { mockState.ensureStream = vi.fn().mockResolvedValue(`/_webhooks/repo`) mockState.createStreamDB = vi.fn() mockState.signalEntity = vi.fn().mockResolvedValue({ txid: 123 }) + mockState.startRealtimeSession = vi.fn().mockResolvedValue({ + sessionId: `rt-1`, + entityUrl: `/horton/demo`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `requested`, + startedAt: `2026-06-09T10:00:00.000Z`, + streams: { + audio_in: `/horton/demo/realtime/rt-1/audio/in`, + 
audio_out: `/horton/demo/realtime/rt-1/audio/out`, + control_in: `/horton/demo/realtime/rt-1/control/in`, + control_out: `/horton/demo/realtime/rt-1/control/out`, + }, + }) mockState.observedDb = { preload: vi.fn().mockResolvedValue(undefined), collections: { @@ -151,6 +167,31 @@ describe(`createAgentsClient`, () => { }) }) + it(`exposes realtime session start through the server client`, async () => { + const client = createAgentsClient({ + baseUrl: `http://electric-agents.test`, + }) + + await expect( + client.startRealtimeSession({ + entityUrl: `/horton/demo`, + provider: `openai`, + model: `gpt-realtime-2`, + }) + ).resolves.toMatchObject({ + sessionId: `rt-1`, + streams: { + audio_in: `/horton/demo/realtime/rt-1/audio/in`, + }, + }) + + expect(mockState.startRealtimeSession).toHaveBeenCalledWith({ + entityUrl: `/horton/demo`, + provider: `openai`, + model: `gpt-realtime-2`, + }) + }) + it(`observe(webhook(...)) ensures the exact stream before preloading it`, async () => { const client = createAgentsClient({ baseUrl: `http://electric-agents.test/t/tenant-a/v1`, diff --git a/packages/agents-runtime/test/entity-timeline.test.ts b/packages/agents-runtime/test/entity-timeline.test.ts index e5eb60eca2..5550c9aec7 100644 --- a/packages/agents-runtime/test/entity-timeline.test.ts +++ b/packages/agents-runtime/test/entity-timeline.test.ts @@ -3,10 +3,12 @@ import { createCollection, createLiveQueryCollection, } from '@durable-streams/state/db' +import { BasicIndex } from '@tanstack/db' import { buildEntityTimelineData, compareTimelineOrders, createEntityIncludesQuery, + createEntityTimelineQuery, getEntityState, normalizeEntityTimelineData, } from '../src/entity-timeline' @@ -36,6 +38,10 @@ function offset(index: number): EventPointer { } } +function emptyOrderableCollection() { + return { toArray: [], __electricRowOffsets: new Map() } +} + describe(`compareTimelineOrders`, () => { it(`compares two numbers`, () => { expect(compareTimelineOrders(1, 2)).toBeLessThan(0) @@ 
-1594,6 +1600,10 @@ describe(`entity includes query`, () => { const inbox = createSyncCollection(`test-inbox`, takeOffset) const wakes = createSyncCollection(`test-wakes`, takeOffset) const signals = createSyncCollection(`test-signals`, takeOffset) + const realtimeTranscripts = createSyncCollection( + `test-realtime-transcripts`, + takeOffset + ) const contextInserted = createSyncCollection( `test-context-inserted`, takeOffset @@ -1604,6 +1614,24 @@ describe(`entity includes query`, () => { ) const manifests = createSyncCollection(`test-manifests`, takeOffset) const childStatus = createSyncCollection(`test-child-status`, takeOffset) + texts.collection.createIndex((row) => row.run_id, { + indexType: BasicIndex, + }) + textDeltas.collection.createIndex((row) => row.text_id, { + indexType: BasicIndex, + }) + textDeltas.collection.createIndex((row) => row.run_id, { + indexType: BasicIndex, + }) + toolCalls.collection.createIndex((row) => row.run_id, { + indexType: BasicIndex, + }) + steps.collection.createIndex((row) => row.run_id, { + indexType: BasicIndex, + }) + errors.collection.createIndex((row) => row.run_id, { + indexType: BasicIndex, + }) return { collections: { runs: runs.collection, @@ -1615,6 +1643,7 @@ describe(`entity includes query`, () => { inbox: inbox.collection, wakes: wakes.collection, signals: signals.collection, + realtimeTranscripts: realtimeTranscripts.collection, contextInserted: contextInserted.collection, contextRemoved: contextRemoved.collection, manifests: manifests.collection, @@ -1630,6 +1659,7 @@ describe(`entity includes query`, () => { inbox: withSeqInjection(inbox, takeSeq), wakes: withSeqInjection(wakes, takeSeq), signals: withSeqInjection(signals, takeSeq), + realtimeTranscripts: withSeqInjection(realtimeTranscripts, takeSeq), contextInserted: withSeqInjection(contextInserted, takeSeq), contextRemoved: withSeqInjection(contextRemoved, takeSeq), manifests: withSeqInjection(manifests, takeSeq), @@ -1829,6 +1859,55 @@ describe(`entity 
includes query`, () => { ) }) + it(`orders live run rows by their first visible item`, async () => { + const { collections, sync } = createEntityCollections() + const queryFn = createEntityTimelineQuery({ collections } as any) + const liveQuery = createLiveQueryCollection({ + query: queryFn, + startSync: true, + }) + await liveQuery.preload() + + sync.runs.insert({ + key: `run-1`, + status: `started`, + _timeline_order: order(1), + }) + sync.realtimeTranscripts.insert({ + key: `rt-in-1`, + session_id: `rt-1`, + direction: `input`, + text: `Find the latest Electric Agents post`, + status: `final`, + audio_stream: `/horton/test/realtime/rt-1/audio/in`, + created_at: `2026-06-09T14:56:00.000Z`, + _timeline_order: order(2), + }) + sync.toolCalls.insert({ + key: `tc-1`, + run_id: `run-1`, + tool_call_id: `tc-1`, + tool_name: `web_search`, + status: `completed`, + args: { query: `most recent blog post Electric Agents site` }, + result: `https://electric.ax/blog/2026/04/29/introducing-electric-agents`, + _timeline_order: order(3), + }) + await new Promise((r) => setTimeout(r, 50)) + + const rows = getData(liveQuery) + expect( + rows.map((row) => + row.realtimeTranscript + ? `realtimeTranscript:${row.realtimeTranscript.key}` + : row.run + ? 
`run:${row.run.key}` + : `other` + ) + ).toEqual([`realtimeTranscript:rt-in-1`, `run:run-1`]) + expect(rows[1]?.run.order).toBe(order(3)) + }) + it(`projects related entities from one manifest row per related entity`, () => { const timeline = buildEntityTimelineData({ collections: { @@ -1841,6 +1920,7 @@ describe(`entity includes query`, () => { inbox: { toArray: [] }, wakes: { toArray: [] }, signals: { toArray: [] }, + realtimeTranscripts: emptyOrderableCollection(), contextInserted: { toArray: [], __electricRowOffsets: new Map() }, contextRemoved: { toArray: [], __electricRowOffsets: new Map() }, manifests: { @@ -1949,6 +2029,7 @@ describe(`entity includes query`, () => { }, wakes: { toArray: [], __electricRowOffsets: new Map() }, signals: { toArray: [], __electricRowOffsets: new Map() }, + realtimeTranscripts: emptyOrderableCollection(), contextInserted: { toArray: [], __electricRowOffsets: new Map() }, contextRemoved: { toArray: [], __electricRowOffsets: new Map() }, manifests: { toArray: [], __electricRowOffsets: new Map() }, @@ -1974,6 +2055,7 @@ describe(`entity includes query`, () => { inbox: { toArray: [], __electricRowOffsets: new Map() }, wakes: { toArray: [], __electricRowOffsets: new Map() }, signals: { toArray: [], __electricRowOffsets: new Map() }, + realtimeTranscripts: emptyOrderableCollection(), contextInserted: { toArray: [ { @@ -2091,6 +2173,7 @@ describe(`entity includes query`, () => { inbox: { toArray: [] }, wakes: { toArray: [] }, signals: { toArray: [] }, + realtimeTranscripts: emptyOrderableCollection(), contextInserted: { toArray: [], __electricRowOffsets: new Map() }, contextRemoved: { toArray: [], __electricRowOffsets: new Map() }, manifests: { @@ -2200,6 +2283,7 @@ describe(`entity includes query`, () => { inbox: { toArray: [] }, wakes: { toArray: [] }, signals: { toArray: [] }, + realtimeTranscripts: emptyOrderableCollection(), contextInserted: { toArray: [], __electricRowOffsets: new Map() }, contextRemoved: { toArray: [], 
__electricRowOffsets: new Map() }, manifests: { diff --git a/packages/agents-runtime/test/helpers/context-test-helpers.ts b/packages/agents-runtime/test/helpers/context-test-helpers.ts index 2aa60e55fd..78faad4d6d 100644 --- a/packages/agents-runtime/test/helpers/context-test-helpers.ts +++ b/packages/agents-runtime/test/helpers/context-test-helpers.ts @@ -304,6 +304,10 @@ export function createTestHandlerContext( wakeEvent?: WakeEvent hydratedEventSourceWake?: HydratedEventSourceWake | null prepareAgentRun?: () => Promise + realtimeStreams?: { + baseUrl: string + headers?: Record + } } = {} ) { const db = opts.db ?? buildStreamFixture([]) @@ -334,6 +338,7 @@ export function createTestHandlerContext( payload: `hi`, }, hydratedEventSourceWake: opts.hydratedEventSourceWake, + realtimeStreams: opts.realtimeStreams, prepareAgentRun: opts.prepareAgentRun, doObserve: vi.fn(), doSpawn: vi.fn(), diff --git a/packages/agents-runtime/test/openai-realtime.test.ts b/packages/agents-runtime/test/openai-realtime.test.ts new file mode 100644 index 0000000000..b7e0735d71 --- /dev/null +++ b/packages/agents-runtime/test/openai-realtime.test.ts @@ -0,0 +1,740 @@ +import { Type } from '@sinclair/typebox' +import { describe, expect, it, vi } from 'vitest' +import { createOpenAIRealtimeProvider } from '../src/openai-realtime' +import type { AgentTool, RealtimeProviderEvent } from '../src/types' + +type Listener = (...args: Array) => void + +class FakeWebSocket { + static instances: Array = [] + + readonly sent: Array = [] + readonly listeners = new Map>() + + constructor( + readonly url: string, + readonly init?: unknown + ) { + FakeWebSocket.instances.push(this) + queueMicrotask(() => this.emit(`open`)) + } + + addEventListener(event: string, listener: Listener): void { + const listeners = this.listeners.get(event) ?? 
[] + listeners.push(listener) + this.listeners.set(event, listeners) + } + + send(data: string): void { + this.sent.push(JSON.parse(data) as unknown) + } + + close(): void { + this.emit(`close`) + } + + emit(event: string, payload?: unknown): void { + for (const listener of this.listeners.get(event) ?? []) { + listener(payload) + } + } + + emitMessage(payload: unknown): void { + this.emit(`message`, { data: JSON.stringify(payload) }) + } +} + +function nextEvent(iterator: AsyncIterator) { + return iterator.next().then((result) => result.value) +} + +describe(`createOpenAIRealtimeProvider`, () => { + it(`connects over WebSocket and configures session state`, async () => { + FakeWebSocket.instances = [] + const tool: AgentTool = { + name: `lookup`, + label: `Lookup`, + description: `Look up a value`, + parameters: Type.Object({ q: Type.String() }), + execute: vi.fn(), + } + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + voice: `marin`, + reasoningEffort: `medium`, + safetyIdentifier: `user-1`, + WebSocket: FakeWebSocket, + }) + + await provider.connect({ + systemPrompt: `You are Horton.`, + messages: [{ role: `user`, content: `Previous context` } as never], + tools: [tool], + audio: { + inputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + outputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + }, + }) + + const socket = FakeWebSocket.instances[0]! 
+ expect(socket.url).toBe( + `wss://api.openai.com/v1/realtime?model=gpt-realtime-2` + ) + expect(socket.init).toEqual({ + headers: { + Authorization: `Bearer sk-test`, + 'OpenAI-Safety-Identifier': `user-1`, + }, + }) + expect(socket.sent[0]).toMatchObject({ + type: `session.update`, + session: { + type: `realtime`, + model: `gpt-realtime-2`, + instructions: `You are Horton.`, + reasoning: { effort: `medium` }, + output_modalities: [`audio`], + tool_choice: `auto`, + tools: [ + { + type: `function`, + name: `lookup`, + description: `Look up a value`, + }, + ], + audio: { + input: { + format: { type: `audio/pcm`, rate: 24_000 }, + transcription: { model: `gpt-4o-mini-transcribe` }, + turn_detection: { + type: `server_vad`, + threshold: 0.55, + prefix_padding_ms: 300, + silence_duration_ms: 500, + create_response: true, + interrupt_response: true, + }, + }, + output: { + format: { type: `audio/pcm`, rate: 24_000 }, + voice: `marin`, + }, + }, + }, + }) + expect(socket.sent[1]).toEqual({ + type: `conversation.item.create`, + item: { + type: `message`, + role: `user`, + content: [{ type: `input_text`, text: `Previous context` }], + }, + }) + }) + + it(`does not send reasoning effort to non-reasoning realtime models`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + model: `gpt-realtime-1.5`, + reasoningEffort: `low`, + WebSocket: FakeWebSocket, + }) + + await provider.connect({ + systemPrompt: `You are Horton.`, + messages: [], + tools: [], + audio: { + outputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + }, + }) + + const socket = FakeWebSocket.instances[0]! 
+ expect((socket.sent[0] as any).session.reasoning).toBeUndefined() + }) + + it(`requests audio output when a voice is configured without an output format`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + voice: `marin`, + WebSocket: FakeWebSocket, + }) + + await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + + const socket = FakeWebSocket.instances[0]! + expect(socket.sent[0]).toMatchObject({ + type: `session.update`, + session: { + output_modalities: [`audio`], + audio: { + output: { + voice: `marin`, + }, + }, + }, + }) + }) + + it(`can disable input audio transcription`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + audio: { + inputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + inputTranscription: false, + }, + }) + + const socket = FakeWebSocket.instances[0]! + expect(socket.sent[0]).toMatchObject({ + session: { + audio: { + input: { + format: { type: `audio/pcm`, rate: 24_000 }, + }, + }, + }, + }) + expect( + (socket.sent[0] as any).session.audio.input.transcription + ).toBeUndefined() + }) + + it(`maps input transcription delay for low latency captions`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + audio: { + inputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + inputTranscription: { + model: `gpt-realtime-whisper`, + delay: `minimal`, + }, + }, + }) + + const socket = FakeWebSocket.instances[0]! 
+ expect(socket.sent[0]).toMatchObject({ + session: { + audio: { + input: { + transcription: { + model: `gpt-realtime-whisper`, + delay: `minimal`, + }, + }, + }, + }, + }) + }) + + it(`can disable realtime turn detection for manual audio commits`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + audio: { + inputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + turnDetection: { type: `none` }, + }, + }) + + const socket = FakeWebSocket.instances[0]! + expect(socket.sent[0]).toMatchObject({ + session: { + audio: { + input: { + turn_detection: null, + }, + }, + }, + }) + }) + + it(`maps realtime server VAD configuration`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + audio: { + inputFormat: { codec: `pcm16`, sampleRate: 24_000, channels: 1 }, + turnDetection: { + type: `server_vad`, + threshold: 0.7, + prefixPaddingMs: 250, + silenceDurationMs: 650, + createResponse: false, + interruptResponse: false, + }, + }, + }) + + const socket = FakeWebSocket.instances[0]! + expect(socket.sent[0]).toMatchObject({ + session: { + audio: { + input: { + turn_detection: { + type: `server_vad`, + threshold: 0.7, + prefix_padding_ms: 250, + silence_duration_ms: 650, + create_response: false, + interrupt_response: false, + }, + }, + }, + }, + }) + }) + + it(`sends audio input chunks as OpenAI input buffer events`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + const socket = FakeWebSocket.instances[0]! 
+ + await session.appendInputAudio?.(new Uint8Array([1, 2, 3, 4])) + await session.clearInputAudio?.() + await session.commitInputAudio?.() + + expect(socket.sent.at(-4)).toEqual({ + type: `input_audio_buffer.append`, + audio: `AQIDBA==`, + }) + expect(socket.sent.at(-3)).toEqual({ type: `input_audio_buffer.clear` }) + expect(socket.sent.at(-2)).toEqual({ type: `input_audio_buffer.commit` }) + expect(socket.sent.at(-1)).toEqual({ type: `response.create` }) + }) + + it(`normalizes audio input chunks before appending them`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + const socket = FakeWebSocket.instances[0]! + + await session.appendInputAudio?.(new Uint8Array()) + await session.appendInputAudio?.(new Uint8Array([1])) + await session.appendInputAudio?.(new Uint8Array([1, 2, 3])) + + const large = new Uint8Array(32 * 1024 + 4) + large.fill(7) + await session.appendInputAudio?.(large) + + const appendEvents = socket.sent.filter( + (event): event is { type: string; audio: string } => + typeof event === `object` && + event !== null && + (event as { type?: unknown }).type === `input_audio_buffer.append` + ) + expect(appendEvents).toHaveLength(3) + expect(appendEvents[0]!.audio).toBe(`AQI=`) + expect(Buffer.from(appendEvents[1]!.audio, `base64`)).toHaveLength( + 32 * 1024 + ) + expect(Buffer.from(appendEvents[2]!.audio, `base64`)).toHaveLength(4) + }) + + it(`unblocks the event stream when the run signal aborts`, async () => { + FakeWebSocket.instances = [] + const controller = new AbortController() + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + signal: controller.signal, + }) + const iterator = 
session.events[Symbol.asyncIterator]() + + controller.abort() + + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `session.closed`, + reason: `aborted`, + }) + }) + + it(`surfaces unexpected WebSocket closes as provider errors`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + const socket = FakeWebSocket.instances[0]! + const iterator = session.events[Symbol.asyncIterator]() + + socket.emit(`close`, { code: 1008, reason: `invalid model` }) + + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `session.error`, + code: `websocket_closed`, + error: + `OpenAI realtime WebSocket closed before client stop ` + + `code=1008 reason=invalid model`, + }) + }) + + it(`can truncate output audio for interrupted playback`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + const socket = FakeWebSocket.instances[0]! + + await session.truncateOutputAudio?.({ + itemId: `item-1`, + audioEndMs: 320, + }) + + expect(socket.sent.at(-1)).toEqual({ + type: `conversation.item.truncate`, + item_id: `item-1`, + content_index: 0, + audio_end_ms: 320, + }) + }) + + it(`maps GA output audio and transcript events`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + const socket = FakeWebSocket.instances[0]! 
+ const iterator = session.events[Symbol.asyncIterator]() + + socket.emitMessage({ + type: `response.output_audio.delta`, + response_id: `resp-1`, + item_id: `item-1`, + delta: `AQID`, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `output_audio.delta`, + responseId: `resp-1`, + itemId: `item-1`, + audio: new Uint8Array([1, 2, 3]), + }) + + socket.emitMessage({ + type: `response.output_audio_transcript.delta`, + response_id: `resp-1`, + item_id: `item-1`, + content_index: 0, + delta: `hello`, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `output_transcript.delta`, + responseId: `resp-1`, + itemId: `item-1`, + contentIndex: 0, + transcriptSource: `response.output_audio_transcript`, + delta: `hello`, + }) + + socket.emitMessage({ + type: `response.output_audio.done`, + response_id: `resp-1`, + item_id: `item-1`, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `output_audio.completed`, + responseId: `resp-1`, + itemId: `item-1`, + }) + }) + + it(`maps GA input audio transcript events`, async () => { + FakeWebSocket.instances = [] + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [], + }) + const socket = FakeWebSocket.instances[0]! 
+ const iterator = session.events[Symbol.asyncIterator]() + + socket.emitMessage({ + type: `input_audio_buffer.speech_started`, + item_id: `item-1`, + audio_start_ms: 120, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `input_audio.speech_started`, + turnId: `item-1`, + audioOffset: `120`, + }) + + socket.emitMessage({ + type: `input_audio_buffer.speech_stopped`, + item_id: `item-1`, + audio_end_ms: 860, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `input_audio.speech_stopped`, + turnId: `item-1`, + audioOffset: `860`, + }) + + socket.emitMessage({ + type: `input_audio_buffer.committed`, + item_id: `item-1`, + previous_item_id: `previous-item`, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `input_audio.committed`, + turnId: `item-1`, + previousTurnId: `previous-item`, + }) + + socket.emitMessage({ + type: `conversation.item.input_audio_transcription.delta`, + item_id: `item-1`, + delta: `hello`, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `input_transcript.delta`, + turnId: `item-1`, + delta: `hello`, + }) + + socket.emitMessage({ + type: `conversation.item.input_audio_transcription.completed`, + item_id: `item-1`, + transcript: `hello there`, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `input_transcript.completed`, + turnId: `item-1`, + text: `hello there`, + }) + }) + + it(`maps OpenAI events and executes function calls`, async () => { + FakeWebSocket.instances = [] + const execute = vi.fn().mockResolvedValue({ + content: [{ type: `text`, text: `done` }], + details: { ok: true }, + }) + const tool: AgentTool = { + name: `lookup`, + label: `Lookup`, + description: `Look up a value`, + parameters: Type.Object({ q: Type.String() }), + execute, + } + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [tool], + }) 
+ const socket = FakeWebSocket.instances[0]! + const iterator = session.events[Symbol.asyncIterator]() + + socket.emitMessage({ type: `session.created`, session: { id: `sess-1` } }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `session.started`, + sessionId: `sess-1`, + }) + + socket.emitMessage({ + type: `response.output_item.added`, + item: { + type: `function_call`, + id: `fc-1`, + call_id: `call-1`, + name: `lookup`, + }, + }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `tool_call.started`, + toolCallId: `call-1`, + name: `lookup`, + }) + + socket.emitMessage({ + type: `response.function_call_arguments.done`, + call_id: `call-1`, + name: `lookup`, + arguments: JSON.stringify({ q: `status` }), + }) + + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `tool_call.arguments_completed`, + toolCallId: `call-1`, + name: `lookup`, + args: { q: `status` }, + }) + await expect(nextEvent(iterator)).resolves.toMatchObject({ + type: `tool_call.completed`, + toolCallId: `call-1`, + name: `lookup`, + }) + expect(execute).toHaveBeenCalledWith(`call-1`, { q: `status` }, undefined) + expect(socket.sent.at(-2)).toMatchObject({ + type: `conversation.item.create`, + item: { + type: `function_call_output`, + call_id: `call-1`, + }, + }) + expect(socket.sent.at(-1)).toEqual({ type: `response.create` }) + }) + + it(`does not send tool results for a cancelled response`, async () => { + FakeWebSocket.instances = [] + let resolveTool: (value: { + content: Array<{ type: `text`; text: string }> + details: Record + }) => void = () => undefined + const execute = vi.fn( + () => + new Promise<{ + content: Array<{ type: `text`; text: string }> + details: Record + }>((resolve) => { + resolveTool = resolve + }) + ) + const tool: AgentTool = { + name: `lookup`, + label: `Lookup`, + description: `Look up a value`, + parameters: Type.Object({ q: Type.String() }), + execute, + } + const provider = createOpenAIRealtimeProvider({ + apiKey: `sk-test`, + 
WebSocket: FakeWebSocket, + }) + + const session = await provider.connect({ + systemPrompt: `Talk`, + messages: [], + tools: [tool], + }) + const socket = FakeWebSocket.instances[0]! + const iterator = session.events[Symbol.asyncIterator]() + + socket.emitMessage({ type: `response.created`, response: { id: `resp-1` } }) + await expect(nextEvent(iterator)).resolves.toEqual({ + type: `response.started`, + responseId: `resp-1`, + }) + + socket.emitMessage({ + type: `response.function_call_arguments.done`, + call_id: `call-1`, + name: `lookup`, + arguments: JSON.stringify({ q: `status` }), + }) + await expect(nextEvent(iterator)).resolves.toMatchObject({ + type: `tool_call.arguments_completed`, + toolCallId: `call-1`, + }) + expect(execute).toHaveBeenCalledWith(`call-1`, { q: `status` }, undefined) + + await session.cancelResponse?.() + resolveTool({ content: [{ type: `text`, text: `done` }], details: {} }) + + await expect(nextEvent(iterator)).resolves.toMatchObject({ + type: `tool_call.completed`, + toolCallId: `call-1`, + }) + expect(socket.sent).toContainEqual({ type: `response.cancel` }) + expect(socket.sent).not.toEqual( + expect.arrayContaining([ + expect.objectContaining({ + type: `conversation.item.create`, + item: expect.objectContaining({ + type: `function_call_output`, + call_id: `call-1`, + }), + }), + ]) + ) + }) +}) diff --git a/packages/agents-runtime/test/realtime-context.test.ts b/packages/agents-runtime/test/realtime-context.test.ts new file mode 100644 index 0000000000..68cd87c12a --- /dev/null +++ b/packages/agents-runtime/test/realtime-context.test.ts @@ -0,0 +1,1076 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest' +import { createTestRealtimeProvider } from '../src/realtime' +import { + buildStreamFixture, + createTestHandlerContext, +} from './helpers/context-test-helpers' +import type { ChangeEvent } from '@durable-streams/state' + +const durableMock = vi.hoisted(() => { + type StreamSource = Iterable | AsyncIterable + const 
appends: Array<{ url: string; data: unknown }> = [] + const bodyStreams = new Map>() + const jsonStreams = new Map>() + class DurableStream { + constructor(readonly opts: { url: string }) {} + + async append(data: unknown): Promise { + appends.push({ url: this.opts.url, data }) + } + + async stream() { + const url = this.opts.url + return { + bodyStream: async function* () { + for await (const chunk of bodyStreams.get(url) ?? []) { + yield chunk + } + }, + jsonStream: async function* () { + for await (const event of jsonStreams.get(url) ?? []) { + yield event + } + }, + cancel: vi.fn(), + } + } + } + + return { appends, bodyStreams, jsonStreams, DurableStream } +}) + +vi.mock(`@durable-streams/client`, () => ({ + DurableStream: durableMock.DurableStream, +})) + +describe(`ctx.useRealtime()`, () => { + beforeEach(() => { + durableMock.appends.length = 0 + durableMock.bodyStreams.clear() + durableMock.jsonStreams.clear() + }) + + it(`records provider transcript output as realtime transcript rows`, async () => { + const { ctx } = createTestHandlerContext() + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ response: `hello from voice` }), + tools: [], + }) + + await realtime.run() + + expect(ctx.db.collections.runs.toArray).toMatchObject([ + { key: `run-0`, status: `completed`, finish_reason: `stop` }, + ]) + expect(ctx.db.collections.steps.toArray).toMatchObject([ + { + key: `step-0`, + run_id: `run-0`, + model_provider: `test`, + model_id: `test-realtime`, + status: `completed`, + finish_reason: `stop`, + }, + ]) + expect(ctx.db.collections.textDeltas.toArray).toMatchObject([ + { + key: `realtime-transcript:ephemeral:output:fallback-0:delta-0`, + text_id: `realtime-transcript:ephemeral:output:fallback-0`, + realtime_transcript_id: `realtime-transcript:ephemeral:output:fallback-0`, + delta: `hello from voice`, + }, + ]) + expect(ctx.db.collections.realtimeTranscripts.toArray).toMatchObject([ + { + 
direction: `output`, + text: `hello from voice`, + status: `final`, + }, + ]) + }) + + it(`persists realtime input and output transcripts`, async () => { + const { ctx } = createTestHandlerContext() + const transcriptEvents: Array<{ + direction: `input` | `output` + text: string + status: `partial` | `final` + turnId?: string + responseId?: string + }> = [] + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ + events: [ + { type: `session.started`, sessionId: `provider-session` }, + { + type: `input_transcript.delta`, + delta: `hel`, + turnId: `input-item-1`, + }, + { + type: `input_transcript.delta`, + delta: `lo`, + turnId: `input-item-1`, + }, + { + type: `input_transcript.completed`, + text: `hello there`, + turnId: `input-item-1`, + }, + { + type: `output_transcript.delta`, + delta: `Hi`, + responseId: `resp-1`, + }, + { + type: `output_transcript.completed`, + text: `Hi there`, + responseId: `resp-1`, + }, + { type: `session.closed` }, + ], + }), + tools: [], + onTranscript: (event) => { + transcriptEvents.push(event) + }, + }) + + await realtime.run() + + expect(ctx.db.collections.realtimeTranscripts.toArray).toMatchObject([ + { + key: `realtime-transcript:provider-session:input:input-item-1`, + session_id: `provider-session`, + direction: `input`, + text: `hello there`, + status: `final`, + turn_id: `input-item-1`, + audio_stream: `input`, + created_at: expect.any(String), + }, + { + key: `realtime-transcript:provider-session:output:resp-1`, + session_id: `provider-session`, + direction: `output`, + text: `Hi there`, + status: `final`, + response_id: `resp-1`, + audio_stream: `output`, + created_at: expect.any(String), + }, + ]) + expect(ctx.db.collections.textDeltas.toArray).toMatchObject([ + { + key: `realtime-transcript:provider-session:input:input-item-1:delta-0`, + text_id: `realtime-transcript:provider-session:input:input-item-1`, + realtime_transcript_id: 
`realtime-transcript:provider-session:input:input-item-1`, + delta: `hel`, + }, + { + key: `realtime-transcript:provider-session:input:input-item-1:delta-1`, + text_id: `realtime-transcript:provider-session:input:input-item-1`, + realtime_transcript_id: `realtime-transcript:provider-session:input:input-item-1`, + delta: `lo`, + }, + { + key: `realtime-transcript:provider-session:input:input-item-1:delta-2`, + text_id: `realtime-transcript:provider-session:input:input-item-1`, + realtime_transcript_id: `realtime-transcript:provider-session:input:input-item-1`, + delta: ` there`, + }, + { + key: `realtime-transcript:provider-session:output:resp-1:delta-0`, + text_id: `realtime-transcript:provider-session:output:resp-1`, + realtime_transcript_id: `realtime-transcript:provider-session:output:resp-1`, + delta: `Hi`, + }, + { + key: `realtime-transcript:provider-session:output:resp-1:delta-1`, + text_id: `realtime-transcript:provider-session:output:resp-1`, + realtime_transcript_id: `realtime-transcript:provider-session:output:resp-1`, + delta: ` there`, + }, + ]) + expect(transcriptEvents).toEqual( + expect.arrayContaining([ + expect.objectContaining({ + direction: `input`, + text: `hello there`, + status: `final`, + turnId: `input-item-1`, + }), + expect.objectContaining({ + direction: `output`, + text: `Hi there`, + status: `final`, + responseId: `resp-1`, + }), + ]) + ) + }) + + it(`uses one output transcript source family per response`, async () => { + const { ctx } = createTestHandlerContext() + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ + events: [ + { type: `session.started`, sessionId: `provider-session` }, + { + type: `output_transcript.delta`, + delta: `Text duplicate`, + responseId: `resp-1`, + itemId: `item-1`, + transcriptSource: `response.output_text`, + }, + { + type: `output_transcript.delta`, + delta: `Audio transcript`, + responseId: `resp-1`, + itemId: `item-1`, + 
transcriptSource: `response.output_audio_transcript`, + }, + { + type: `output_transcript.delta`, + delta: ` ignored`, + responseId: `resp-1`, + itemId: `item-1`, + transcriptSource: `response.output_text`, + }, + { + type: `output_transcript.completed`, + text: `Audio transcript final`, + responseId: `resp-1`, + itemId: `item-1`, + transcriptSource: `response.output_audio_transcript`, + }, + { type: `session.closed` }, + ], + }), + tools: [], + }) + + await realtime.run() + + expect( + ctx.db.collections.realtimeTranscripts.get( + `realtime-transcript:provider-session:output:resp-1` + ) + ).toMatchObject({ + direction: `output`, + text: `Audio transcript final`, + status: `final`, + }) + }) + + it(`does not seed active realtime session transcripts into provider history`, async () => { + const { ctx } = createTestHandlerContext() + const capturedMessages: Array = [] + + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `test`, + model: `test-realtime`, + status: `requested`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/entities/test/realtime/rt-1/audio/in`, + audio_out: `/entities/test/realtime/rt-1/audio/out`, + control_in: `/entities/test/realtime/rt-1/control/in`, + control_out: `/entities/test/realtime/rt-1/control/out`, + }, + }) + ctx.db.collections.realtimeTranscripts.insert({ + key: `rt-active`, + session_id: `rt-1`, + direction: `input`, + text: `active session text`, + status: `final`, + audio_stream: `input`, + created_at: `2026-06-09T12:00:01.000Z`, + }) + ctx.db.collections.realtimeTranscripts.insert({ + key: `rt-prior`, + session_id: `rt-prior`, + direction: `input`, + text: `prior session text`, + status: `final`, + audio_stream: `input`, + created_at: `2026-06-09T11:00:01.000Z`, + }) + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: { + id: `test`, + model: `test-realtime`, + 
async connect(input) { + capturedMessages.push(...input.messages) + return { + events: (async function* () { + yield { type: `session.started` as const, sessionId: `rt-1` } + yield { type: `session.closed` as const } + })(), + } + }, + }, + tools: [], + }) + + await realtime.run() + + expect(capturedMessages).toEqual([ + { role: `user`, content: `prior session text` }, + ]) + }) + + it(`anchors delayed input transcripts at speech start`, async () => { + const db = buildStreamFixture([]) + const events: Array = [] + const { ctx } = createTestHandlerContext({ + db, + writeEvent: (event) => { + events.push(event) + db.utils.applyEvent(event) + }, + }) + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ + events: [ + { type: `session.started`, sessionId: `provider-session` }, + { type: `input_audio.speech_started`, turnId: `input-item-1` }, + { + type: `output_transcript.delta`, + delta: `Hi`, + responseId: `resp-1`, + }, + { + type: `output_transcript.completed`, + text: `Hi there`, + responseId: `resp-1`, + }, + { + type: `input_transcript.completed`, + text: `hello there`, + turnId: `input-item-1`, + }, + { type: `session.closed` }, + ], + }), + tools: [], + }) + + await realtime.run() + + const transcriptEvents = events.filter( + (event) => + event.type === `realtime_transcript` && + event.key === `realtime-transcript:provider-session:input:input-item-1` + ) + expect(transcriptEvents).toHaveLength(2) + expect(transcriptEvents[0]).toMatchObject({ + headers: { operation: `insert` }, + value: { + direction: `input`, + text: ``, + status: `partial`, + }, + }) + expect(transcriptEvents[1]).toMatchObject({ + headers: { operation: `update` }, + value: { + direction: `input`, + text: `hello there`, + status: `final`, + }, + }) + + const inputTranscriptInsertIndex = events.findIndex( + (event) => event === transcriptEvents[0] + ) + const firstAssistantTranscriptIndex = events.findIndex( + (event) => + 
event.type === `realtime_transcript` && + event.key === `realtime-transcript:provider-session:output:resp-1` && + event.headers.operation === `insert` + ) + expect(inputTranscriptInsertIndex).toBeGreaterThanOrEqual(0) + expect(firstAssistantTranscriptIndex).toBeGreaterThanOrEqual(0) + expect(inputTranscriptInsertIndex).toBeLessThan( + firstAssistantTranscriptIndex + ) + }) + + it(`splits output transcripts around later input speech`, async () => { + const db = buildStreamFixture([]) + const events: Array = [] + const { ctx } = createTestHandlerContext({ + db, + writeEvent: (event) => { + events.push(event) + db.utils.applyEvent(event) + }, + }) + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ + events: [ + { type: `session.started`, sessionId: `provider-session` }, + { + type: `output_transcript.delta`, + delta: `Hello `, + responseId: `resp-1`, + }, + { type: `input_audio.speech_started`, turnId: `input-item-1` }, + { + type: `output_transcript.delta`, + delta: `there`, + responseId: `resp-1`, + }, + { + type: `input_transcript.completed`, + text: `interrupting`, + turnId: `input-item-1`, + }, + { + type: `output_transcript.completed`, + text: `Hello there`, + responseId: `resp-1`, + }, + { type: `session.closed` }, + ], + }), + tools: [], + }) + + await realtime.run() + + expect( + ctx.db.collections.realtimeTranscripts.get( + `realtime-transcript:provider-session:output:resp-1` + ) + ).toMatchObject({ + direction: `output`, + text: `Hello `, + status: `final`, + }) + expect( + ctx.db.collections.realtimeTranscripts.get( + `realtime-transcript:provider-session:input:input-item-1` + ) + ).toMatchObject({ + direction: `input`, + text: `interrupting`, + status: `final`, + }) + expect( + ctx.db.collections.realtimeTranscripts.get( + `realtime-transcript:provider-session:output:resp-1:segment-1` + ) + ).toMatchObject({ + direction: `output`, + text: `there`, + status: `final`, + }) + + const 
firstOutputInsertIndex = events.findIndex( + (event) => + event.type === `realtime_transcript` && + event.key === `realtime-transcript:provider-session:output:resp-1` && + event.headers.operation === `insert` + ) + const inputInsertIndex = events.findIndex( + (event) => + event.type === `realtime_transcript` && + event.key === + `realtime-transcript:provider-session:input:input-item-1` && + event.headers.operation === `insert` + ) + const secondOutputInsertIndex = events.findIndex( + (event) => + event.type === `realtime_transcript` && + event.key === + `realtime-transcript:provider-session:output:resp-1:segment-1` && + event.headers.operation === `insert` + ) + expect(firstOutputInsertIndex).toBeGreaterThanOrEqual(0) + expect(inputInsertIndex).toBeGreaterThan(firstOutputInsertIndex) + expect(secondOutputInsertIndex).toBeGreaterThan(inputInsertIndex) + }) + + it(`finds active realtime sessions from the manifest`, () => { + const { ctx } = createTestHandlerContext() + + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `active`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/entities/test/realtime/rt-1/audio/in`, + audio_out: `/entities/test/realtime/rt-1/audio/out`, + control_in: `/entities/test/realtime/rt-1/control/in`, + control_out: `/entities/test/realtime/rt-1/control/out`, + }, + }) + + expect(ctx.realtime.activeSession()).toMatchObject({ + id: `rt-1`, + status: `active`, + }) + }) + + it(`marks realtime sessions closed when the provider stream ends`, async () => { + const { ctx } = createTestHandlerContext() + + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `requested`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + 
streams: { + audio_in: `/entities/test/realtime/rt-1/audio/in`, + audio_out: `/entities/test/realtime/rt-1/audio/out`, + control_in: `/entities/test/realtime/rt-1/control/in`, + control_out: `/entities/test/realtime/rt-1/control/out`, + }, + }) + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ response: `done` }), + tools: [], + }) + + await realtime.run() + + expect(ctx.realtime.activeSession()).toBeUndefined() + expect( + ctx.db.collections.manifests.get(`realtime-session:rt-1`) + ).toMatchObject({ + status: `closed`, + endedAt: expect.any(String), + meta: { reason: `completed` }, + }) + expect( + ctx.db.collections.realtimeSessions.get(`realtime-session:rt-1`) + ).toMatchObject({ + status: `closed`, + ended_at: expect.any(String), + reason: `completed`, + }) + }) + + it(`marks realtime sessions failed when provider setup fails`, async () => { + const { ctx } = createTestHandlerContext() + + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `requested`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/entities/test/realtime/rt-1/audio/in`, + audio_out: `/entities/test/realtime/rt-1/audio/out`, + control_in: `/entities/test/realtime/rt-1/control/in`, + control_out: `/entities/test/realtime/rt-1/control/out`, + }, + }) + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: { + id: `openai`, + model: `gpt-realtime-2`, + connect: async () => { + throw new Error(`missing key`) + }, + }, + tools: [], + }) + + await expect(realtime.run()).rejects.toThrow(`missing key`) + expect(ctx.realtime.activeSession()).toBeUndefined() + expect( + ctx.db.collections.manifests.get(`realtime-session:rt-1`) + ).toMatchObject({ + status: `failed`, + endedAt: expect.any(String), + meta: { error: `missing key` }, 
+ }) + }) + + it(`does not fail the run when OpenAI reports inactive response cancellation`, async () => { + const { ctx } = createTestHandlerContext() + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ + events: [ + { type: `session.started` }, + { + type: `session.error`, + code: `response_cancel_not_active`, + error: `Cancellation failed: no active response found`, + }, + { type: `session.closed` }, + ], + }), + tools: [], + }) + + await expect(realtime.run()).resolves.toMatchObject({ + usage: { tokens: 0 }, + }) + expect(ctx.db.collections.runs.toArray).toMatchObject([ + { status: `completed`, finish_reason: `stop` }, + ]) + }) + + it(`does not fail the run when OpenAI reports a stale output audio truncate`, async () => { + const { ctx } = createTestHandlerContext() + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ + events: [ + { type: `session.started` }, + { + type: `session.error`, + code: `invalid_value`, + error: `Audio content of 6350ms is already shorter than 8160ms`, + }, + { type: `session.closed` }, + ], + }), + tools: [], + }) + + await expect(realtime.run()).resolves.toMatchObject({ + usage: { tokens: 0 }, + }) + expect(ctx.db.collections.runs.toArray).toMatchObject([ + { status: `completed`, finish_reason: `stop` }, + ]) + }) + + it(`persists provider audio and control output to realtime durable streams`, async () => { + const { ctx } = createTestHandlerContext({ + realtimeStreams: { + baseUrl: `http://server.test`, + headers: { authorization: `Bearer claim` }, + }, + }) + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `active`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/test/entity/realtime/rt-1/audio/in`, + audio_out: 
`/test/entity/realtime/rt-1/audio/out`, + control_in: `/test/entity/realtime/rt-1/control/in`, + control_out: `/test/entity/realtime/rt-1/control/out`, + }, + }) + + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: createTestRealtimeProvider({ + events: [ + { type: `session.started`, sessionId: `rt-1` }, + { + type: `output_audio.delta`, + audio: new Uint8Array([1, 2, 3]), + responseId: `resp-1`, + itemId: `item-1`, + }, + { type: `output_audio.completed`, responseId: `resp-1` }, + { type: `session.closed` }, + ], + }), + tools: [], + }) + + await realtime.run() + + expect(durableMock.appends).toEqual([ + { + url: `http://server.test/test/entity/realtime/rt-1/control/out`, + data: expect.any(Uint8Array), + }, + { + url: `http://server.test/test/entity/realtime/rt-1/audio/out`, + data: new Uint8Array([1, 2, 3]), + }, + { + url: `http://server.test/test/entity/realtime/rt-1/control/out`, + data: expect.any(Uint8Array), + }, + { + url: `http://server.test/test/entity/realtime/rt-1/control/out`, + data: expect.any(Uint8Array), + }, + { + url: `http://server.test/test/entity/realtime/rt-1/control/out`, + data: expect.any(Uint8Array), + }, + ]) + const decoder = new TextDecoder() + expect( + JSON.parse(decoder.decode(durableMock.appends[2]!.data as Uint8Array)) + ).toEqual({ + type: `output_audio.delta`, + responseId: `resp-1`, + itemId: `item-1`, + byteLength: 3, + }) + expect(ctx.db.collections.realtimeAudioSpans.toArray).toMatchObject([ + { + session_id: `rt-1`, + stream: `output`, + producer_id: `/test/entity/realtime/rt-1/audio/out`, + seq: 0, + byte_start: 0, + byte_end: 3, + byte_length: 3, + sample_start: 0, + sample_count: 1, + sample_rate: 24_000, + channels: 1, + codec: `pcm16`, + timing_source: `provider`, + participant_id: `assistant`, + provider_item_id: `item-1`, + response_id: `resp-1`, + }, + ]) + }) + + it(`skips realtime input audio commits below the provider minimum`, async () => { + const { ctx } = 
createTestHandlerContext({ + realtimeStreams: { + baseUrl: `http://server.test`, + headers: { authorization: `Bearer claim` }, + }, + }) + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `active`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/test/entity/realtime/rt-1/audio/in`, + audio_out: `/test/entity/realtime/rt-1/audio/out`, + control_in: `/test/entity/realtime/rt-1/control/in`, + control_out: `/test/entity/realtime/rt-1/control/out`, + }, + }) + + durableMock.bodyStreams.set( + `http://server.test/test/entity/realtime/rt-1/audio/in`, + [new Uint8Array(2048)] + ) + durableMock.jsonStreams.set( + `http://server.test/test/entity/realtime/rt-1/control/in`, + [ + { type: `input_audio.commit`, afterAudioBytes: 2048 }, + { type: `session.close`, reason: `test` }, + ] + ) + + const appendInputAudio = vi.fn() + const clearInputAudio = vi.fn() + const commitInputAudio = vi.fn() + const close = vi.fn() + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: { + id: `test`, + model: `test-realtime`, + connect: async () => ({ + appendInputAudio, + clearInputAudio, + commitInputAudio, + close, + events: (async function* () { + yield { type: `session.started` as const, sessionId: `rt-1` } + await new Promise((resolve) => setTimeout(resolve, 20)) + yield { type: `session.closed` as const } + })(), + }), + }, + tools: [], + audio: { + turnDetection: { type: `none` }, + }, + }) + + await realtime.run() + + expect(appendInputAudio).not.toHaveBeenCalled() + expect(clearInputAudio).toHaveBeenCalledTimes(1) + expect(commitInputAudio).not.toHaveBeenCalled() + expect(close).toHaveBeenCalledWith(`test`) + }) + + it(`commits only the requested realtime input audio byte range`, async () => { + const { ctx } = createTestHandlerContext({ + realtimeStreams: { + baseUrl: 
`http://server.test`, + headers: { authorization: `Bearer claim` }, + }, + }) + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `active`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/test/entity/realtime/rt-1/audio/in`, + audio_out: `/test/entity/realtime/rt-1/audio/out`, + control_in: `/test/entity/realtime/rt-1/control/in`, + control_out: `/test/entity/realtime/rt-1/control/out`, + }, + }) + + const firstTurnAudio = new Uint8Array(4800).fill(1) + const secondTurnAudio = new Uint8Array(4800).fill(2) + durableMock.bodyStreams.set( + `http://server.test/test/entity/realtime/rt-1/audio/in`, + [firstTurnAudio, secondTurnAudio] + ) + durableMock.jsonStreams.set( + `http://server.test/test/entity/realtime/rt-1/control/in`, + [ + { type: `input_audio.commit`, afterAudioBytes: 4800 }, + { type: `input_audio.commit`, afterAudioBytes: 9600 }, + { type: `session.close`, reason: `test` }, + ] + ) + + const appendInputAudio = vi.fn() + const commitInputAudio = vi.fn() + const close = vi.fn() + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: { + id: `test`, + model: `test-realtime`, + connect: async () => ({ + appendInputAudio, + commitInputAudio, + close, + events: (async function* () { + yield { type: `session.started` as const, sessionId: `rt-1` } + await new Promise((resolve) => setTimeout(resolve, 20)) + yield { type: `session.closed` as const } + })(), + }), + }, + tools: [], + audio: { + turnDetection: { type: `none` }, + }, + }) + + await realtime.run() + + expect(appendInputAudio).toHaveBeenNthCalledWith(1, firstTurnAudio) + expect(appendInputAudio).toHaveBeenNthCalledWith(2, secondTurnAudio) + expect(commitInputAudio).toHaveBeenCalledTimes(2) + expect(close).toHaveBeenCalledWith(`test`) + }) + + it(`streams realtime input audio directly when 
provider VAD is enabled`, async () => { + const { ctx } = createTestHandlerContext({ + realtimeStreams: { + baseUrl: `http://server.test`, + headers: { authorization: `Bearer claim` }, + }, + }) + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `active`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/test/entity/realtime/rt-1/audio/in`, + audio_out: `/test/entity/realtime/rt-1/audio/out`, + control_in: `/test/entity/realtime/rt-1/control/in`, + control_out: `/test/entity/realtime/rt-1/control/out`, + }, + }) + + const firstChunk = new Uint8Array(2048).fill(1) + const secondChunk = new Uint8Array(2048).fill(2) + durableMock.bodyStreams.set( + `http://server.test/test/entity/realtime/rt-1/audio/in`, + [firstChunk, secondChunk] + ) + durableMock.jsonStreams.set( + `http://server.test/test/entity/realtime/rt-1/control/in`, + (async function* () { + await new Promise((resolve) => setTimeout(resolve, 20)) + yield { type: `session.close`, reason: `test` } + })() + ) + + const appendInputAudio = vi.fn() + const commitInputAudio = vi.fn() + const close = vi.fn() + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: { + id: `test`, + model: `test-realtime`, + connect: async () => ({ + appendInputAudio, + commitInputAudio, + close, + events: (async function* () { + yield { type: `session.started` as const, sessionId: `rt-1` } + await new Promise((resolve) => setTimeout(resolve, 20)) + yield { type: `session.closed` as const } + })(), + }), + }, + tools: [], + }) + + await realtime.run() + + expect(appendInputAudio).toHaveBeenNthCalledWith(1, firstChunk) + expect(appendInputAudio).toHaveBeenNthCalledWith(2, secondChunk) + expect(commitInputAudio).not.toHaveBeenCalled() + expect(close).toHaveBeenCalledWith(`test`) + 
expect(ctx.db.collections.realtimeAudioSpans.toArray).toMatchObject([ + { + session_id: `rt-1`, + stream: `input`, + producer_id: `/test/entity/realtime/rt-1/audio/in`, + seq: 0, + byte_start: 0, + byte_end: 4096, + byte_length: 4096, + sample_start: 0, + sample_count: 2048, + sample_rate: 24_000, + channels: 1, + codec: `pcm16`, + timing_source: `runtime`, + participant_id: `user`, + }, + ]) + }) + + it(`does not block later realtime control commands behind pending audio bytes`, async () => { + const { ctx } = createTestHandlerContext({ + realtimeStreams: { + baseUrl: `http://server.test`, + headers: { authorization: `Bearer claim` }, + }, + }) + ctx.db.collections.manifests.insert({ + key: `realtime-session:rt-1`, + kind: `realtime-session`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `active`, + startedAt: `2026-06-09T12:00:00.000Z`, + endedAt: null, + retention: `forever`, + streams: { + audio_in: `/test/entity/realtime/rt-1/audio/in`, + audio_out: `/test/entity/realtime/rt-1/audio/out`, + control_in: `/test/entity/realtime/rt-1/control/in`, + control_out: `/test/entity/realtime/rt-1/control/out`, + }, + }) + + durableMock.jsonStreams.set( + `http://server.test/test/entity/realtime/rt-1/control/in`, + [ + { type: `input_audio.commit`, afterAudioBytes: 9600 }, + { type: `session.close`, reason: `test` }, + ] + ) + + const commitInputAudio = vi.fn() + const close = vi.fn() + const realtime = ctx.useRealtime({ + systemPrompt: `You are realtime.`, + provider: { + id: `test`, + model: `test-realtime`, + connect: async () => ({ + commitInputAudio, + close, + events: (async function* () { + yield { type: `session.started` as const, sessionId: `rt-1` } + await new Promise((resolve) => setTimeout(resolve, 20)) + yield { type: `session.closed` as const } + })(), + }), + }, + tools: [], + audio: { + turnDetection: { type: `none` }, + }, + }) + + await realtime.run() + + expect(commitInputAudio).not.toHaveBeenCalled() + 
expect(close).toHaveBeenCalledWith(`test`) + }) +}) diff --git a/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts b/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts index 89e3839644..c88a9ebad4 100644 --- a/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts +++ b/packages/agents-runtime/test/runtime-server-client-update-metadata.test.ts @@ -136,6 +136,84 @@ describe(`runtime-server-client.setTag`, () => { }) }) +describe(`runtime-server-client realtime sessions`, () => { + it(`starts a realtime session through the control-plane route`, async () => { + const calls: Array<{ url: string; init?: RequestInit }> = [] + const responseBody = { + sessionId: `rt-1`, + entityUrl: `/horton/demo`, + provider: `openai`, + model: `gpt-realtime-2`, + status: `requested`, + startedAt: `2026-06-09T10:00:00.000Z`, + streams: { + audio_in: `/horton/demo/realtime/rt-1/audio/in`, + audio_out: `/horton/demo/realtime/rt-1/audio/out`, + control_in: `/horton/demo/realtime/rt-1/control/in`, + control_out: `/horton/demo/realtime/rt-1/control/out`, + }, + } + const fakeFetch = vi.fn(async (url: string, init?: RequestInit) => { + calls.push({ url, init }) + return new Response(JSON.stringify(responseBody), { + status: 201, + headers: { 'content-type': `application/json` }, + }) + }) as unknown as typeof fetch + const client = createRuntimeServerClient({ + baseUrl: `http://test.example/t/tenant-a/v1`, + fetch: fakeFetch, + principalKey: `user:sam`, + }) + + await expect( + client.startRealtimeSession({ + entityUrl: `/horton/demo`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + inputAudio: { codec: `pcm16`, sampleRate: 16_000, channels: 1 }, + meta: { source: `button` }, + }) + ).resolves.toEqual(responseBody) + + expect(calls).toHaveLength(1) + expect(calls[0]!.url).toBe( + `http://test.example/t/tenant-a/v1/_electric/realtime/sessions` + ) + expect(calls[0]!.init?.method).toBe(`POST`) + const headers = 
new Headers(calls[0]!.init?.headers) + expect(headers.get(`content-type`)).toBe(`application/json`) + expect(headers.get(`electric-principal`)).toBe(`user:sam`) + expect(JSON.parse(calls[0]!.init!.body as string)).toEqual({ + entityUrl: `/horton/demo`, + id: `rt-1`, + provider: `openai`, + model: `gpt-realtime-2`, + inputAudio: { codec: `pcm16`, sampleRate: 16_000, channels: 1 }, + meta: { source: `button` }, + }) + }) + + it(`surfaces realtime session start failures`, async () => { + const fakeFetch = vi.fn( + async () => new Response(`not allowed`, { status: 401 }) + ) as unknown as typeof fetch + const client = createRuntimeServerClient({ + baseUrl: `http://test.example`, + fetch: fakeFetch, + }) + + await expect( + client.startRealtimeSession({ + entityUrl: `/horton/demo`, + provider: `openai`, + model: `gpt-realtime-2`, + }) + ).rejects.toThrow(/startRealtimeSession.*401.*not allowed/) + }) +}) + describe(`runtime-server-client event sources`, () => { it(`lists event sources from the runtime server`, async () => { const fakeFetch = vi.fn( diff --git a/packages/agents-runtime/test/timeline-context.test.ts b/packages/agents-runtime/test/timeline-context.test.ts index 0370ca1b1c..ec434cb326 100644 --- a/packages/agents-runtime/test/timeline-context.test.ts +++ b/packages/agents-runtime/test/timeline-context.test.ts @@ -6,6 +6,7 @@ import { import type { EntityStreamDB } from '../src/entity-stream-db' import type { IncludesInboxMessage, + IncludesRealtimeTranscript, IncludesRun, IncludesSignal, IncludesWakeMessage, @@ -172,6 +173,77 @@ describe(`timeline context`, () => { expect(result).toEqual([{ role: `user`, content: `updated text` }]) }) + it(`projects realtime input and output transcripts as chat messages`, () => { + const realtimeTranscripts: Array = [ + { + key: `rt-in`, + order: order(1), + session_id: `rt-1`, + direction: `input`, + text: `voice question`, + status: `final`, + audio_stream: `input`, + created_at: `2026-03-28T00:00:00.000Z`, + }, + { + 
key: `rt-out`, + order: order(2), + session_id: `rt-1`, + direction: `output`, + text: `voice answer`, + status: `final`, + audio_stream: `output`, + created_at: `2026-03-28T00:00:01.000Z`, + }, + ] + + expect( + buildTimelineMessages({ + runs: [], + inbox: [], + wakes: [], + realtimeTranscripts, + }) + ).toEqual([ + { role: `user`, content: `voice question` }, + { role: `assistant`, content: `voice answer` }, + ]) + }) + + it(`does not project partial realtime transcripts as chat messages`, () => { + const realtimeTranscripts: Array = [ + { + key: `rt-partial`, + order: order(1), + session_id: `rt-1`, + direction: `input`, + text: `partially heard`, + status: `partial`, + audio_stream: `input`, + created_at: `2026-03-28T00:00:00.000Z`, + }, + { + key: `rt-final`, + order: order(2), + session_id: `rt-1`, + direction: `input`, + text: `final question`, + status: `final`, + audio_stream: `input`, + created_at: `2026-03-28T00:00:01.000Z`, + }, + ] + + expect( + buildTimelineMessages({ + runs: [], + inbox: [], + wakes: [], + realtimeTranscripts, + }) + ).toEqual([{ role: `user`, content: `final question` }]) + }) + it(`buildTimelineMessages keeps pending tool calls without emitting tool results`, () => { expect( buildTimelineMessages({ @@ -494,6 +566,7 @@ describe(`timeline context`, () => { __electricRowOffsets: new Map([[`wake-1`, offset(7)]]), }, signals: { toArray: [], __electricRowOffsets: new Map() }, + realtimeTranscripts: { toArray: [], __electricRowOffsets: new Map() }, contextInserted: { toArray: [], __electricRowOffsets: new Map() }, contextRemoved: { toArray: [], __electricRowOffsets: new Map() }, manifests: { toArray: [], __electricRowOffsets: new Map() }, @@ -536,6 +609,7 @@ describe(`timeline context`, () => { inbox: { toArray: [] }, wakes: { toArray: [] }, signals: { toArray: [] }, + realtimeTranscripts: { toArray: [] }, contextInserted: { toArray: [] }, contextRemoved: { toArray: [] }, manifests: { toArray: [] }, diff --git 
a/packages/agents-runtime/tsdown.config.ts b/packages/agents-runtime/tsdown.config.ts index f2e095fd14..d0f378e6d7 100644 --- a/packages/agents-runtime/tsdown.config.ts +++ b/packages/agents-runtime/tsdown.config.ts @@ -8,6 +8,8 @@ const config: Options = { `src/sandbox.ts`, `src/sandbox-docker.ts`, `src/client.ts`, + `src/use-chat.ts`, + `src/use-chat-hook.ts`, // First-class entry so its .d.ts is stable (raced chunk fails dts gen in CI). `src/skills/types.ts`, ], diff --git a/packages/agents-server-ui/src/components/AgentResponse.tsx b/packages/agents-server-ui/src/components/AgentResponse.tsx index 312e5d0fc5..073343ecc4 100644 --- a/packages/agents-server-ui/src/components/AgentResponse.tsx +++ b/packages/agents-server-ui/src/components/AgentResponse.tsx @@ -403,10 +403,21 @@ export const AgentResponseLive = memo(function AgentResponseLive({ (q) => (run.errors ? q.from({ error: run.errors }) : undefined), [run.errors] ) + const { data: steps = [] } = useLiveQuery( + (q) => (run.steps ? 
q.from({ step: run.steps }) : undefined), + [run.steps] + ) const sortedItems = useMemo( () => [...items].sort(compareLiveRunItems), [items] ) + const isRealtimeRun = useMemo( + () => + (steps as Array<{ model_id?: string }>).some((step) => + step.model_id?.includes(`realtime`) + ), + [steps] + ) const contentItems = useMemo( () => liveRunItemsToContentItems(sortedItems), [sortedItems] @@ -479,6 +490,10 @@ export const AgentResponseLive = memo(function AgentResponseLive({ copiedTimerRef.current = setTimeout(() => setCopied(false), 1200) } + if (isRealtimeRun && sortedItems.length === 0 && !failureText) { + return <> + } + return ( {sortedItems.map((item, i) => { diff --git a/packages/agents-server-ui/src/components/EntityContextDrawer.tsx b/packages/agents-server-ui/src/components/EntityContextDrawer.tsx index 7f93db305d..72d1f04fee 100644 --- a/packages/agents-server-ui/src/components/EntityContextDrawer.tsx +++ b/packages/agents-server-ui/src/components/EntityContextDrawer.tsx @@ -570,6 +570,7 @@ function manifestKindLabel(manifest: Manifest): string { case `schedule`: return manifest.scheduleType === `cron` ? 
`Cron schedule` : `Future send` } + return manifest.kind } function createParentEntry(parent: DrawerEntity): DrawerEntry { @@ -707,6 +708,7 @@ function createManifestEntry( entity: null, } } + return null } function describeSourceConfig(config: unknown): string { diff --git a/packages/agents-server-ui/src/components/EntityTimeline.tsx b/packages/agents-server-ui/src/components/EntityTimeline.tsx index 214b806066..cdadff61f7 100644 --- a/packages/agents-server-ui/src/components/EntityTimeline.tsx +++ b/packages/agents-server-ui/src/components/EntityTimeline.tsx @@ -45,7 +45,7 @@ import { useCurrentPrincipal } from '../hooks/useCurrentPrincipal' import { Icon, IconButton, ScrollArea, Stack, Text, Tooltip } from '../ui' import { UserMessage } from './UserMessage' import type { ForkFromHereAction, UserMessageAttachment } from './UserMessage' -import { AgentResponseLive } from './AgentResponse' +import { AgentResponse, AgentResponseLive } from './AgentResponse' import { InlineEventCard } from './InlineEventCard' import { InlineStatusBadge } from './InlineStatusBadge' import { @@ -101,6 +101,20 @@ function readInboxPayloadDisplay(payload: unknown): string { return stringifyPayload(payload, 2) } +function isRealtimeSessionWake(row: RenderTimelineRow): boolean { + const changes = row.wake?.payload.changes + if (!Array.isArray(changes)) return false + return changes.some((change) => { + if (!change || typeof change !== `object`) return false + const payload = (change as { payload?: unknown }).payload + return ( + !!payload && + typeof payload === `object` && + (payload as { type?: unknown }).type === `realtime_session.started` + ) + }) +} + function stringifySearchPayload(value: unknown): string { if (value == null) return `` if (typeof value === `string`) return value @@ -229,6 +243,13 @@ function estimateRowHeight( ) return Math.max(64, 48 + lines * lineHeight) + timelineRowGap(row) } + if (row.realtimeTranscript) { + const lines = Math.max( + 1, + 
Math.ceil(row.realtimeTranscript.text.length / charsPerLine) + ) + return Math.max(64, 48 + lines * lineHeight) + timelineRowGap(row) + } if (row.wake || row.signal || row.manifest) { return 76 + timelineRowGap(row) } @@ -257,6 +278,7 @@ function timelineRowSearchText( runSearchTextByKey: Map ): string { if (row.inbox) return readInboxText(row.inbox.payload) + if (row.realtimeTranscript) return row.realtimeTranscript.text if (row.wake) { return wakeSectionText({ kind: `wake`, @@ -272,6 +294,7 @@ function timelineRowSearchText( function timelineRowLabel(row: RenderTimelineRow): string { if (row.inbox?.from_agent) return `Agent message` if (row.inbox) return `User message` + if (row.realtimeTranscript) return `Voice message` if (row.wake) return `Wake` if (row.signal) return `Signal` if (row.manifest) return `Manifest item` @@ -731,6 +754,7 @@ function manifestKindLabel(manifest: Manifest): string { case `schedule`: return `Schedule` } + return manifest.kind } function manifestTitle(manifest: Manifest): string { @@ -746,6 +770,7 @@ function manifestTitle(manifest: Manifest): string { case `schedule`: return manifest.id } + return manifest.key } function manifestMeta(manifest: Manifest): string { @@ -767,6 +792,7 @@ function manifestMeta(manifest: Manifest): string { ? `${manifest.expression}${manifest.timezone ? ` · ${manifest.timezone}` : ``}` : `${manifest.fireAt} · ${manifest.status}` } + return `` } function manifestDetails( @@ -825,6 +851,7 @@ function manifestDetails( { label: `Status`, value: manifest.status ?? 
`pending` }, ] } + return [] } function manifestIcon(manifest: Manifest) { @@ -965,6 +992,46 @@ const TimelineRow = memo(function TimelineRow({ ) } + if (row.realtimeTranscript) { + if (row.realtimeTranscript.text.trim().length === 0) { + return <> + } + const timestamp = Date.parse(row.realtimeTranscript.created_at) + if (row.realtimeTranscript.direction === `output`) { + const isStreamingTranscript = row.realtimeTranscript.status !== `final` + return ( + + ) + } + return ( + + ) + } + if (row.wake) { return ( (null) const textColumnWidth = Math.max(0, contentWidth - CHAT_SURFACE_GUTTER) const displayRows = useMemo( - () => rows.filter((row) => !isAttachmentManifest(row.manifest)), + () => + rows.filter( + (row) => + !isAttachmentManifest(row.manifest) && !isRealtimeSessionWake(row) + ), [rows] ) const attachmentsByInboxKey = useMemo(() => { @@ -1200,7 +1271,7 @@ export function EntityTimeline({ if (streamingIndex < 0) return null for (let index = streamingIndex - 1; index >= 0; index--) { const row = displayRows[index] - if (row?.inbox) { + if (row?.inbox || row?.realtimeTranscript) { return row.$key } } @@ -1217,6 +1288,9 @@ export function EntityTimeline({ if (row.inbox) { const timestamp = Date.parse(row.inbox.timestamp) lastUserTimestamp = Number.isFinite(timestamp) ? timestamp : null + } else if (row.realtimeTranscript) { + const timestamp = Date.parse(row.realtimeTranscript.created_at) + lastUserTimestamp = Number.isFinite(timestamp) ? 
timestamp : null } else if (row.run) { timestampByRowKey.set(row.$key, lastUserTimestamp) } diff --git a/packages/agents-server-ui/src/components/MessageInput.module.css b/packages/agents-server-ui/src/components/MessageInput.module.css index a9fa5e06f7..5ced0808eb 100644 --- a/packages/agents-server-ui/src/components/MessageInput.module.css +++ b/packages/agents-server-ui/src/components/MessageInput.module.css @@ -63,6 +63,43 @@ color: var(--ds-text-1); } +.inlineIconButton.voiceActive { + background: var(--ds-accent-a3); + color: var(--ds-accent-11); +} + +.voiceMeter { + display: inline-flex; + align-items: center; + justify-content: center; + gap: 2px; + width: 0; + height: 20px; + color: var(--ds-accent-11); + opacity: 0; + overflow: hidden; + transition: + opacity 0.12s ease, + width 0.12s ease; +} + +.voiceMeter[data-active='true'] { + width: 18px; + opacity: 1; +} + +.voiceMeterBar { + display: block; + width: 3px; + height: 14px; + border-radius: var(--ds-radius-full); + background: currentColor; + transform-origin: center bottom; + transition: + opacity 0.08s linear, + transform 0.08s linear; +} + .inlineIconButton:focus-visible { outline: 2px solid var(--ds-accent-a6); outline-offset: -2px; diff --git a/packages/agents-server-ui/src/components/MessageInput.tsx b/packages/agents-server-ui/src/components/MessageInput.tsx index 74e79243ec..2ddf40a53c 100644 --- a/packages/agents-server-ui/src/components/MessageInput.tsx +++ b/packages/agents-server-ui/src/components/MessageInput.tsx @@ -1,5 +1,5 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react' -import { ArrowUp, Square } from 'lucide-react' +import { ArrowUp, AudioLines, Square } from 'lucide-react' import { useLiveQuery } from '@tanstack/react-db' import type { EntityStreamDBWithActions } from '@electric-ax/agents-runtime/client' import { @@ -10,6 +10,11 @@ import { readTextPayload, } from '../lib/sendMessage' import { serializeComposerInput } from 
'@electric-ax/agents-runtime/client' +import { + startRealtimeAudioSession, + type RealtimeAudioSession, +} from '../lib/realtime-audio' +import { useRealtimeAvailability } from '../hooks/useRealtimeAvailability' import { ComposerEditor } from './ComposerEditor' import { ComposerShell } from './ComposerShell' import { Icon, Stack, Text, Tooltip } from '../ui' @@ -45,6 +50,10 @@ export function MessageInput({ drawer, onSend, onStop, + autoStartRealtimeSignal, + autoStartRealtimeInitialText, + autoStartRealtimeGreetIfSilent = false, + onRealtimeAutoStartConsumed, }: { db: EntityStreamDBWithActions | null baseUrl: string @@ -62,6 +71,10 @@ export function MessageInput({ onOptimisticQueuedMessage?: (message: OptimisticInboxMessage) => void onSend?: () => void onStop?: () => void + autoStartRealtimeSignal?: string | null + autoStartRealtimeInitialText?: string + autoStartRealtimeGreetIfSilent?: boolean + onRealtimeAutoStartConsumed?: () => void /** * Optional content rendered above the composer, sharing its docked * width and lift into the timeline above. 
The composer is z-indexed @@ -85,7 +98,13 @@ export function MessageInput({ key: string originalText: string } | null>(null) + const [realtimePending, setRealtimePending] = useState(false) + const [realtimeActive, setRealtimeActive] = useState(false) + const [realtimeInputLevel, setRealtimeInputLevel] = useState(0) + const realtimeSessionRef = useRef(null) + const handledAutoStartRealtimeRef = useRef(null) const composerFocusRef = useRef<{ focus: () => void } | null>(null) + const realtimeAvailability = useRealtimeAvailability() const inputDisabled = disabled || writeDisabled const attachmentsDisabled = inputDisabled || Boolean(editingMessage) || !imageAttachmentsEnabled @@ -160,10 +179,24 @@ export function MessageInput({ !inputDisabled && !editingMessage && imageAttachmentsEnabled const showStop = generationActive && + !realtimeActive && inputText.length === 0 && attachmentCount === 0 && !disabled const canStop = showStop && !stopPending && !stopDisabled + const canStartRealtime = + !inputDisabled && + !editingMessage && + attachmentCount === 0 && + Boolean(baseUrl) && + realtimeAvailability.canStart + + useEffect(() => { + return () => { + void realtimeSessionRef.current?.stop() + realtimeSessionRef.current = null + } + }, []) const handleSubmit = useCallback( (composerPayload?: ComposerInputPayload) => { @@ -171,6 +204,16 @@ export function MessageInput({ setError(null) const text = value.trim() const files = imageAttachmentsEnabled ? attachments : [] + if (realtimeSessionRef.current && !editingMessage && files.length === 0) { + const session = realtimeSessionRef.current + setValue(``) + onSend?.() + session.sendText(text).catch((err: Error) => { + setError(err.message) + setValue((current) => (current ? current : text)) + }) + return + } const tx = editingMessage ? 
updateAction?.({ key: editingMessage.key, @@ -221,6 +264,102 @@ export function MessageInput({ handleSubmit() }, [canStop, handleSubmit, onStop]) + const startRealtimeSession = useCallback( + ({ + initialText, + greetIfSilent = false, + }: { initialText?: string; greetIfSilent?: boolean } = {}) => { + if (realtimePending) return + setError(null) + if (!canStartRealtime) { + if (realtimeAvailability.unavailableReason) { + setError(realtimeAvailability.unavailableReason) + } + return + } + setRealtimePending(true) + startRealtimeAudioSession({ + baseUrl, + entityUrl, + onInputLevel: setRealtimeInputLevel, + initialText, + greetIfSilent, + }) + .then((session) => { + realtimeSessionRef.current = session + setRealtimeActive(true) + }) + .catch((err: Error) => { + setError(err.message) + setRealtimeInputLevel(0) + }) + .finally(() => { + setRealtimePending(false) + }) + }, + [ + baseUrl, + canStartRealtime, + entityUrl, + realtimeAvailability.unavailableReason, + realtimePending, + ] + ) + + const handleRealtimeToggle = useCallback(() => { + if (realtimePending) return + setError(null) + if (realtimeSessionRef.current) { + const session = realtimeSessionRef.current + realtimeSessionRef.current = null + setRealtimePending(true) + session + .stop() + .catch((err: Error) => setError(err.message)) + .finally(() => { + setRealtimeActive(false) + setRealtimeInputLevel(0) + setRealtimePending(false) + }) + return + } + startRealtimeSession() + }, [realtimePending, startRealtimeSession]) + + useEffect(() => { + if (!autoStartRealtimeSignal) return + if (handledAutoStartRealtimeRef.current === autoStartRealtimeSignal) return + if (realtimeAvailability.loading || realtimePending) return + if (!realtimeAvailability.canStart) { + handledAutoStartRealtimeRef.current = autoStartRealtimeSignal + onRealtimeAutoStartConsumed?.() + if (realtimeAvailability.unavailableReason) { + setError(realtimeAvailability.unavailableReason) + } + return + } + if (!canStartRealtime) return + 
handledAutoStartRealtimeRef.current = autoStartRealtimeSignal + onRealtimeAutoStartConsumed?.() + if (!realtimeSessionRef.current) { + startRealtimeSession({ + initialText: autoStartRealtimeInitialText, + greetIfSilent: autoStartRealtimeGreetIfSilent, + }) + } + }, [ + autoStartRealtimeSignal, + autoStartRealtimeGreetIfSilent, + autoStartRealtimeInitialText, + canStartRealtime, + onRealtimeAutoStartConsumed, + realtimeAvailability.canStart, + realtimeAvailability.loading, + realtimeAvailability.unavailableReason, + realtimePending, + startRealtimeSession, + ]) + const startEditing = useCallback( (message: EntityTimelineData[`inbox`][number]) => { if (inputDisabled) return @@ -296,11 +435,26 @@ export function MessageInput({ ) const isButtonActive = canSubmit || (showStop && !stopDisabled) + const voiceLevel = realtimeActive ? realtimeInputLevel : 0 + const voiceBars = [ + Math.max(0.18, Math.min(1, 0.24 + voiceLevel * 0.76)), + Math.max(0.24, Math.min(1, 0.34 + voiceLevel * 0.9)), + Math.max(0.16, Math.min(1, 0.2 + voiceLevel * 0.82)), + ] const sendTooltip = showStop ? stopDisabled ? `Signal permission required` : `Stop generating` : `Send message` + const realtimeTooltip = realtimeActive + ? `Stop voice mode` + : attachmentCount > 0 + ? `Remove attachments to start voice mode` + : realtimeAvailability.loading + ? `Checking realtime credentials` + : (realtimeAvailability.unavailableReason ?? `Start voice mode`) + const realtimeButtonDisabled = + realtimePending || (!realtimeActive && !canStartRealtime) return ( {drawer?.({ @@ -349,15 +503,55 @@ export function MessageInput({ ) : null } controls={ - imageAttachmentsEnabled ? ( - - ) : null + <> + + + + + + + {imageAttachmentsEnabled ? 
( + + ) : null} + } send={ diff --git a/packages/agents-server-ui/src/components/NewSessionPage.module.css b/packages/agents-server-ui/src/components/NewSessionPage.module.css index 70106efecc..241c4d76e1 100644 --- a/packages/agents-server-ui/src/components/NewSessionPage.module.css +++ b/packages/agents-server-ui/src/components/NewSessionPage.module.css @@ -470,6 +470,36 @@ display: inline-flex; } +.composerVoice { + all: unset; + display: inline-flex; + align-items: center; + justify-content: center; + width: 24px; + height: 24px; + border-radius: var(--ds-radius-full); + background: var(--ds-gray-a3); + color: var(--ds-text-3); + cursor: pointer; + transition: + background 0.12s ease, + color 0.12s ease, + opacity 0.12s ease; + flex-shrink: 0; +} +.composerVoice:hover:not(:disabled) { + background: var(--ds-gray-a4); + color: var(--ds-text-1); +} +.composerVoice:disabled { + cursor: not-allowed; + opacity: 0.55; +} +.composerVoicePending { + background: var(--ds-accent-a3); + color: var(--ds-accent-11); +} + .composerSend { all: unset; display: inline-flex; diff --git a/packages/agents-server-ui/src/components/settings/SettingsSidebar.tsx b/packages/agents-server-ui/src/components/settings/SettingsSidebar.tsx index adbb6d5c95..30019938fa 100644 --- a/packages/agents-server-ui/src/components/settings/SettingsSidebar.tsx +++ b/packages/agents-server-ui/src/components/settings/SettingsSidebar.tsx @@ -6,6 +6,7 @@ import { KeyRound, Palette, Plug, + RadioTower, Server, Settings as SettingsIcon, Terminal, @@ -21,6 +22,7 @@ export type SettingsCategoryId = | `account` | `servers` | `credentials` + | `realtime` | `command-line` | `appearance` | `local-runtime` @@ -105,6 +107,12 @@ export function SettingsSidebar({ icon: , visible: true, }, + { + id: `realtime`, + label: `Realtime`, + icon: , + visible: true, + }, { id: `command-line`, label: `Command Line`, diff --git a/packages/agents-server-ui/src/components/settings/pages/RealtimePage.module.css 
b/packages/agents-server-ui/src/components/settings/pages/RealtimePage.module.css new file mode 100644 index 0000000000..f7681ab603 --- /dev/null +++ b/packages/agents-server-ui/src/components/settings/pages/RealtimePage.module.css @@ -0,0 +1,61 @@ +.modelSelect { + min-width: 240px; +} + +.modelList { + display: flex; + flex-direction: column; + gap: 0; +} + +.modelItem { + display: flex; + align-items: flex-start; + justify-content: space-between; + gap: 16px; + padding: 12px 0; + border-top: 1px solid var(--ds-border-1); +} + +.modelItem:first-child { + padding-top: 0; + border-top: 0; +} + +.modelItem:last-child { + padding-bottom: 0; +} + +.modelText { + min-width: 0; + display: flex; + flex-direction: column; + gap: 4px; +} + +.modelTitle { + display: inline-flex; + align-items: center; + gap: 6px; + min-width: 0; + color: var(--ds-text-1); + font-size: var(--ds-text-sm); +} + +.modelId { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + color: var(--ds-text-3); + font-size: var(--ds-text-xs); +} + +.modelDescription { + color: var(--ds-text-3); + font-size: var(--ds-text-xs); + line-height: 1.45; +} + +.recommended { + flex-shrink: 0; +} diff --git a/packages/agents-server-ui/src/components/settings/pages/RealtimePage.tsx b/packages/agents-server-ui/src/components/settings/pages/RealtimePage.tsx new file mode 100644 index 0000000000..d465242059 --- /dev/null +++ b/packages/agents-server-ui/src/components/settings/pages/RealtimePage.tsx @@ -0,0 +1,388 @@ +import { useEffect, useMemo, useState } from 'react' +import { useNavigate } from '@tanstack/react-router' +import { + loadRealtimeSettingsStatus, + saveRealtimeSettings, + type RealtimeSettingsStatus, +} from '../../../lib/server-connection' +import { Button, Select, Switch, Text } from '../../../ui' +import { + SettingsPanel, + SettingsRow, + SettingsScreen, + SettingsSection, + SettingsStatusBadge, +} from '../SettingsScreen' +import styles from './RealtimePage.module.css' + +export 
function RealtimePage(): React.ReactElement { + const isDesktop = typeof window !== `undefined` && Boolean(window.electronAPI) + const navigate = useNavigate() + const [status, setStatus] = useState(null) + const [saving, setSaving] = useState(false) + const [error, setError] = useState(null) + + useEffect(() => { + let cancelled = false + void loadRealtimeSettingsStatus().then((next) => { + if (cancelled) return + setStatus(next) + }) + return () => { + cancelled = true + } + }, []) + + const modelById = useMemo( + () => new Map(status?.availableModels.map((model) => [model.id, model])), + [status?.availableModels] + ) + const voiceById = useMemo( + () => new Map(status?.availableVoices.map((voice) => [voice.id, voice])), + [status?.availableVoices] + ) + const reasoningEffortById = useMemo( + () => + new Map( + status?.availableReasoningEfforts.map((effort) => [effort.id, effort]) + ), + [status?.availableReasoningEfforts] + ) + const selectedModel = status ? modelById.get(status.settings.model) : null + const selectedVoice = status ? voiceById.get(status.settings.voice) : null + const selectedReasoningEffort = status + ? reasoningEffortById.get(status.settings.reasoningEffort) + : null + + const saveSettingsPatch = async ( + patch: Partial + ): Promise => { + if (!status) return + const next = { + ...status, + settings: { ...status.settings, ...patch }, + } + setStatus(next) + setSaving(true) + setError(null) + try { + await saveRealtimeSettings(next.settings) + } catch (err) { + setStatus(status) + setError(err instanceof Error ? err.message : String(err)) + } finally { + setSaving(false) + } + } + + return ( + + + {!isDesktop ? ( + + + Realtime settings are managed by the connected desktop or server + runtime. This web build uses the default model when starting a + session from the browser. + + + ) : !status ? 
( + + + Loading… + + + ) : ( + <> + + + {authBadgeLabel(status)} + + + + } + /> + OpenAI + } + /> + { + if (model) void saveSettingsPatch({ model }) + }} + disabled={saving} + > + + model ? (modelById.get(model)?.label ?? model) : `Model` + } + /> + + {status.availableModels.map((model) => ( + + {model.label} + + ))} + + + } + /> + { + if (voice) void saveSettingsPatch({ voice }) + }} + disabled={saving} + > + + voice ? (voiceById.get(voice)?.label ?? voice) : `Voice` + } + /> + + {status.availableVoices.map((voice) => ( + + {voice.label} + + ))} + + + } + /> + { + if (reasoningEffort) { + void saveSettingsPatch({ + reasoningEffort: + reasoningEffort as RealtimeSettingsStatus[`settings`][`reasoningEffort`], + }) + } + }} + disabled={ + saving || status.settings.model !== `gpt-realtime-2` + } + > + + reasoningEffort + ? (reasoningEffortById.get( + reasoningEffort as RealtimeSettingsStatus[`settings`][`reasoningEffort`] + )?.label ?? reasoningEffort) + : `Effort` + } + /> + + {status.availableReasoningEfforts.map((effort) => ( + + {effort.label} + + ))} + + + } + /> + { + void saveSettingsPatch({ interruptResponse }) + }} + /> + } + /> + {saving && ( + + + Saving… + + + )} + {error && ( + + + {error} + + + )} + + )} + + + {status && ( + + +
+ {status.availableVoices.map((voice) => ( +
+
+ + {voice.label} + {voice.recommended && ( + + Recommended + + )} + + {voice.id} + + {voice.description} + +
+ {voice.id === status.settings.voice && ( + + + Selected + + + )} +
+ ))} +
+
+
+ )} + + {status && ( + + +
+ {status.availableModels.map((model) => ( +
+
+ + {model.label} + {model.recommended && ( + + Recommended + + )} + + {model.id} + + {model.description} + +
+ {model.id === status.settings.model && ( + + + Selected + + + )} +
+ ))} +
+
+
+ )} +
+ ) +} + +function authDescription(status: RealtimeSettingsStatus): string { + if (status.openAIApiKeyStatus === `valid`) { + return `Realtime sessions connect to the OpenAI Realtime API with your OpenAI API key.` + } + if (status.openAIApiKeyStatus === `invalid`) { + return ( + status.openAIApiKeyError ?? + `The configured OpenAI API key could not be used for realtime audio.` + ) + } + if (status.openAIApiKeyStatus === `unknown`) { + return ( + status.openAIApiKeyError ?? + `Unable to verify realtime API access right now.` + ) + } + if (status.codexEnabled) { + return `ChatGPT / Codex sign-in is enabled, but realtime voice still needs an OpenAI API key.` + } + return `Add an OpenAI API key in Credentials. ChatGPT / Codex sign-in alone does not grant Realtime API access.` +} + +function authBadgeLabel(status: RealtimeSettingsStatus): string { + switch (status.openAIApiKeyStatus) { + case `valid`: + return `Ready` + case `invalid`: + return `Invalid key` + case `unknown`: + return status.hasOpenAIApiKey ? 
`Verify failed` : `Checking` + case `missing`: + return `API key required` + } +} diff --git a/packages/agents-server-ui/src/components/views/ChatView.tsx b/packages/agents-server-ui/src/components/views/ChatView.tsx index 0f64c6c239..65d6fee95c 100644 --- a/packages/agents-server-ui/src/components/views/ChatView.tsx +++ b/packages/agents-server-ui/src/components/views/ChatView.tsx @@ -12,6 +12,7 @@ import { useEntityPermissions, type EntityPermission, } from '../../hooks/useEntityPermission' +import { useWorkspace } from '../../hooks/useWorkspace' import type { ViewProps } from '../../lib/workspace/viewRegistry' import type { EntityTimelineQueryRow } from '@electric-ax/agents-runtime/client' import type { EventPointer } from '@electric-ax/agents-runtime' @@ -24,6 +25,8 @@ const CHAT_VIEW_PERMISSIONS: ReadonlyArray = [ `signal`, `fork`, ] +const REALTIME_INITIAL_TEXT_VIEW_PARAM = `realtimeInitialText` +const REALTIME_GREET_VIEW_PARAM = `realtimeGreet` /** * The default view: chat / timeline + message composer. 
@@ -40,6 +43,7 @@ export function ChatView({ entityStopped, isSpawning, tileId, + viewParams, }: ViewProps): React.ReactElement { // While `spawning`, the entity has no inbox yet — `connectUrl` is null // so `useEntityTimeline` doesn't try to subscribe and we render an empty @@ -54,6 +58,7 @@ export function ChatView({ entityStopped={entityStopped} isSpawning={isSpawning} tileId={tileId} + viewParams={viewParams} /> ) } @@ -178,6 +183,7 @@ function GenericChatBody({ entityStopped, isSpawning, tileId, + viewParams, }: { baseUrl: string entityUrl: string | null @@ -185,6 +191,7 @@ function GenericChatBody({ entityStopped: boolean isSpawning: boolean tileId: string + viewParams?: ViewProps[`viewParams`] }): React.ReactElement { const { timelineRows, @@ -197,6 +204,7 @@ function GenericChatBody({ } = useEntityTimeline(baseUrl || null, entityUrl) const { signalEntity, forkEntity, entityTypesCollection } = useElectricAgents() + const { helpers } = useWorkspace() const permissions = useEntityPermissions(entity, CHAT_VIEW_PERMISSIONS) const canWrite = permissions.write const canSignal = permissions.signal @@ -282,6 +290,37 @@ function GenericChatBody({ setStopPending(false) }, [entityUrl]) + const autoStartRealtimeSignal = + viewParams?.realtime === `start` && entityUrl + ? [ + entityUrl, + `realtime`, + `start`, + viewParams[REALTIME_INITIAL_TEXT_VIEW_PARAM] ?? ``, + viewParams[REALTIME_GREET_VIEW_PARAM] ?? ``, + ].join(`:`) + : null + const autoStartRealtimeInitialText = + viewParams?.realtime === `start` + ? viewParams[REALTIME_INITIAL_TEXT_VIEW_PARAM] + : undefined + const autoStartRealtimeGreetIfSilent = + viewParams?.realtime === `start` && + viewParams[REALTIME_GREET_VIEW_PARAM] === `1` + const handleRealtimeAutoStartConsumed = useCallback(() => { + const nextParams = Object.fromEntries( + Object.entries(viewParams ?? 
{}).filter( + ([key]) => + key !== `realtime` && + key !== REALTIME_INITIAL_TEXT_VIEW_PARAM && + key !== REALTIME_GREET_VIEW_PARAM + ) + ) + helpers.setTileView(tileId, `chat`, { + viewParams: Object.keys(nextParams).length > 0 ? nextParams : undefined, + }) + }, [helpers, tileId, viewParams]) + const stopGeneration = useCallback(() => { if (!canSignal) return if (!entityUrl || !signalEntity || !generationActive || stopPending) return @@ -399,6 +438,10 @@ function GenericChatBody({ )} onSend={() => setSentMessageSignal((value) => value + 1)} onStop={stopGeneration} + autoStartRealtimeSignal={autoStartRealtimeSignal} + autoStartRealtimeInitialText={autoStartRealtimeInitialText} + autoStartRealtimeGreetIfSilent={autoStartRealtimeGreetIfSilent} + onRealtimeAutoStartConsumed={handleRealtimeAutoStartConsumed} /> ) diff --git a/packages/agents-server-ui/src/components/views/NewSessionView.tsx b/packages/agents-server-ui/src/components/views/NewSessionView.tsx index a7ab1ba345..8f6aef7ea3 100644 --- a/packages/agents-server-ui/src/components/views/NewSessionView.tsx +++ b/packages/agents-server-ui/src/components/views/NewSessionView.tsx @@ -1,6 +1,7 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react' import { ArrowUp, + AudioLines, Check, ChevronDown, ChevronRight, @@ -12,6 +13,7 @@ import { COMPOSER_INPUT_MESSAGE_TYPE } from '@electric-ax/agents-runtime/client' import { nanoid } from 'nanoid' import { useElectricAgents } from '../../lib/ElectricAgentsProvider' import { useWorkspace } from '../../hooks/useWorkspace' +import { useRealtimeAvailability } from '../../hooks/useRealtimeAvailability' import { recentWorkingDirsForRunner } from '../../lib/recentWorkingDirectories' import { isSandboxProfileRemote, @@ -55,6 +57,7 @@ import type { SlashCommandRow, } from '@electric-ax/agents-runtime/client' import type { StandaloneViewProps } from '../../lib/workspace/viewRegistry' +import type { TileViewParams } from '../../lib/workspace/types' /** * The 
"default agent" — when an entity type with this name is registered @@ -62,6 +65,9 @@ import type { StandaloneViewProps } from '../../lib/workspace/viewRegistry' * so the most common flow is one keystroke away. */ const DEFAULT_AGENT_NAME = `horton` +const REALTIME_AUTOSTART_VIEW_PARAMS: TileViewParams = { realtime: `start` } +const REALTIME_INITIAL_TEXT_VIEW_PARAM = `realtimeInitialText` +const REALTIME_GREET_VIEW_PARAM = `realtimeGreet` const HERO_TITLES = [ `Let’s ship`, @@ -342,7 +348,8 @@ export function NewSessionView({ initialMessage?: unknown, initialMessageType?: string, initialAttachments?: Array, - sandboxProfile?: string | null + sandboxProfile?: string | null, + viewParams?: TileViewParams ): Promise => { if (!spawnEntity) return false setError(null) @@ -400,6 +407,7 @@ export function NewSessionView({ } helpers.openEntity(entityUrl, { target: { tileId, position: `replace` }, + ...(viewParams ? { viewParams } : {}), }) return true } catch (err) { @@ -448,20 +456,15 @@ export function NewSessionView({ return () => setToolbarTitle(null) }, [handleCancelSelected, selected, setToolbarTitle]) - const handleStartDefault = useCallback( - async ( - input: string | ComposerInputPayload, + const prepareDefaultAgentArgs = useCallback( + ( args: Record, - attachments: Array, sandboxProfile: string | null - ): Promise => { - if (!defaultAgent) return false - // Inject the picker's choice into the spawn args for the composer flow - // only — non-default agents have their own schemas and may not - // understand `workingDirectory`. A remote sandbox runs in the provider - // VM, so a host working directory is meaningless there: skip it for - // remote profiles. The spawned session itself becomes the newest - // synced recent for this runner. + ): Record => { + // Inject the picker's choice into the spawn args for the default-agent + // composer only — non-default agents have their own schemas and may not + // understand `workingDirectory`. 
Remote sandboxes run in provider VMs, so + // host paths are meaningless there. const profileIsRemote = isSandboxProfileRemote( allSandboxProfiles, sandboxProfile @@ -470,7 +473,20 @@ export function NewSessionView({ // factory — require a (non-remote) profile or the arg is a no-op. const includeWorkingDir = workingDirectory !== null && sandboxProfile !== null && !profileIsRemote - const augmented = includeWorkingDir ? { ...args, workingDirectory } : args + return includeWorkingDir ? { ...args, workingDirectory } : args + }, + [allSandboxProfiles, workingDirectory] + ) + + const handleStartDefault = useCallback( + async ( + input: string | ComposerInputPayload, + args: Record, + attachments: Array, + sandboxProfile: string | null + ): Promise => { + if (!defaultAgent) return false + const augmented = prepareDefaultAgentArgs(args, sandboxProfile) const hasAttachments = attachments.length > 0 const initialMessage = typeof input === `string` @@ -491,7 +507,35 @@ export function NewSessionView({ sandboxProfile ) }, - [defaultAgent, doSpawn, workingDirectory, allSandboxProfiles] + [defaultAgent, doSpawn, prepareDefaultAgentArgs] + ) + + const handleStartDefaultRealtime = useCallback( + async ( + input: string, + args: Record, + sandboxProfile: string | null + ): Promise => { + if (!defaultAgent) return false + const augmented = prepareDefaultAgentArgs(args, sandboxProfile) + const initialText = input.trim() + const viewParams: TileViewParams = { + ...REALTIME_AUTOSTART_VIEW_PARAMS, + ...(initialText + ? { [REALTIME_INITIAL_TEXT_VIEW_PARAM]: initialText } + : { [REALTIME_GREET_VIEW_PARAM]: `1` }), + } + return await doSpawn( + defaultAgent.name, + augmented, + undefined, + undefined, + undefined, + sandboxProfile, + viewParams + ) + }, + [defaultAgent, doSpawn, prepareDefaultAgentArgs] ) const defaultComposerReady = @@ -529,6 +573,7 @@ export function NewSessionView({ defaultAgentSandboxProfiles={defaultAgent ? 
allSandboxProfiles : []} onSelectType={handleSelectType} onStartDefault={handleStartDefault} + onStartDefaultRealtime={handleStartDefaultRealtime} spawnReady={Boolean(spawnEntity)} defaultComposerReady={defaultComposerReady} error={error} @@ -551,6 +596,7 @@ function Picker({ defaultAgentSandboxProfiles, onSelectType, onStartDefault, + onStartDefaultRealtime, spawnReady, defaultComposerReady, error, @@ -571,6 +617,11 @@ function Picker({ attachments: Array, sandboxProfile: string | null ) => Promise + onStartDefaultRealtime: ( + input: string, + args: Record, + sandboxProfile: string | null + ) => Promise spawnReady: boolean defaultComposerReady: boolean error: string | null @@ -606,6 +657,7 @@ function Picker({ agent={defaultAgent} sandboxProfiles={defaultAgentSandboxProfiles} onSubmit={onStartDefault} + onStartRealtime={onStartDefaultRealtime} disabled={!defaultComposerReady} workingDirectory={workingDirectory} onChangeWorkingDirectory={onChangeWorkingDirectory} @@ -925,6 +977,7 @@ function DefaultAgentComposer({ agent, sandboxProfiles, onSubmit, + onStartRealtime, disabled, workingDirectory, onChangeWorkingDirectory, @@ -941,6 +994,11 @@ function DefaultAgentComposer({ attachments: Array, sandboxProfile: string | null ) => Promise + onStartRealtime: ( + input: string, + args: Record, + sandboxProfile: string | null + ) => Promise disabled?: boolean workingDirectory: string | null onChangeWorkingDirectory: (path: string | null) => void @@ -960,8 +1018,13 @@ function DefaultAgentComposer({ [sandboxProfiles, selectedSandboxProfile] ) const [value, setValue] = useState(``) - const [submitting, setSubmitting] = useState(false) + const [submittingMode, setSubmittingMode] = useState< + `message` | `realtime` | null + >(null) + const submitting = submittingMode !== null + const realtimeSubmitting = submittingMode === `realtime` const composerFocusRef = useRef<{ focus: () => void } | null>(null) + const realtimeAvailability = useRealtimeAvailability() const inlineProps = 
useMemo( () => inlineSchemaProperties(agent.creation_schema), [agent.creation_schema] @@ -1059,7 +1122,7 @@ function DefaultAgentComposer({ payload ?? serializeComposerInput(value, slashCommands) const trimmed = nextPayload.source.trim() if ((!trimmed && files.length === 0) || disabled || submitting) return - setSubmitting(true) + setSubmittingMode(`message`) const cleaned: Record = {} for (const [k, v] of Object.entries(args)) { if (v !== undefined && v !== ``) cleaned[k] = v @@ -1078,7 +1141,7 @@ function DefaultAgentComposer({ }) .catch(() => undefined) .finally(() => { - setSubmitting(false) + setSubmittingMode(null) }) }, [ @@ -1095,6 +1158,40 @@ function DefaultAgentComposer({ ] ) + const startRealtime = useCallback(() => { + const files = imageAttachmentsEnabled ? attachments : [] + if (disabled || submitting || files.length > 0) return + if (!realtimeAvailability.canStart) return + const initialText = serializeComposerInput( + value, + slashCommands + ).source.trim() + setSubmittingMode(`realtime`) + const cleaned: Record = {} + for (const [k, v] of Object.entries(args)) { + if (v !== undefined && v !== ``) cleaned[k] = v + } + void onStartRealtime(initialText, cleaned, selectedSandboxProfile) + .then((ok) => { + if (ok) setValue(``) + }) + .catch(() => undefined) + .finally(() => { + setSubmittingMode(null) + }) + }, [ + args, + attachments, + disabled, + imageAttachmentsEnabled, + onStartRealtime, + realtimeAvailability.canStart, + selectedSandboxProfile, + slashCommands, + submitting, + value, + ]) + const attachmentCount = imageAttachmentsEnabled ? attachments.length : 0 const isActive = Boolean( (value.trim() || attachmentCount > 0) && !disabled && !submitting @@ -1103,6 +1200,19 @@ function DefaultAgentComposer({ const sendTooltip = submitting ? `Starting ${agent.name} session` : `Start ${agent.name} session` + const realtimeTooltip = + attachmentCount > 0 + ? `Remove attachments to start voice mode` + : realtimeSubmitting + ? 
`Starting voice session` + : realtimeAvailability.loading + ? `Checking realtime credentials` + : (realtimeAvailability.unavailableReason ?? `Start voice session`) + const realtimeDisabled = + disabled || + submitting || + attachmentCount > 0 || + !realtimeAvailability.canStart return (
{submitting && ( - Starting… + + {realtimeSubmitting ? `Starting voice…` : `Starting…`} + )} + + + + +