From 0c38ce346e3dafdaaa11c90f0999358079cb0f3a Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sat, 4 Jul 2026 15:02:32 +0100 Subject: [PATCH 01/12] feat: add real-time voice dictation to composer Adds dictation support using OpenAI's Realtime API over WebRTC: Relay: - New /transcribe/status and /transcribe/session endpoints - BUZZ_OPENAI_API_KEY env var gates the feature (hidden when absent) - Proxies ephemeral client-secret minting from OpenAI Desktop: - New features/dictation module with: - AudioWorklet for 24kHz PCM capture + buffering - WebRTC peer connection to OpenAI Realtime API - Real-time transcript merging into composer - Auto-submit on trigger phrase ('submit') - Mic button in composer toolbar (red pulse when recording) - Integrated into MessageComposer via useComposerDictation hook Signed-off-by: klopez4212 --- .env.example | 8 + crates/buzz-relay/Cargo.toml | 1 + crates/buzz-relay/src/api/mod.rs | 3 +- crates/buzz-relay/src/api/transcribe.rs | 182 +++++++++++++ crates/buzz-relay/src/config.rs | 13 +- crates/buzz-relay/src/router.rs | 10 + .../dictation/api/transcribeSession.ts | 35 +++ .../dictation/hooks/useComposerDictation.ts | 32 +++ .../features/dictation/hooks/useDictation.ts | 86 ++++++ .../dictation/hooks/useRealtimeDictation.ts | 257 ++++++++++++++++++ desktop/src/features/dictation/index.ts | 3 + .../features/dictation/lib/realtimeAudio.ts | 145 ++++++++++ .../dictation/lib/realtimeBufferWorklet.ts | 53 ++++ .../src/features/dictation/lib/voiceInput.ts | 108 ++++++++ .../features/dictation/ui/DictationButton.tsx | 54 ++++ .../features/messages/ui/MessageComposer.tsx | 17 +- 16 files changed, 1004 insertions(+), 3 deletions(-) create mode 100644 crates/buzz-relay/src/api/transcribe.rs create mode 100644 desktop/src/features/dictation/api/transcribeSession.ts create mode 100644 desktop/src/features/dictation/hooks/useComposerDictation.ts create mode 100644 desktop/src/features/dictation/hooks/useDictation.ts create mode 100644 desktop/src/features/dictation/hooks/useRealtimeDictation.ts create mode 100644 desktop/src/features/dictation/index.ts create mode 100644 desktop/src/features/dictation/lib/realtimeAudio.ts create mode 100644 desktop/src/features/dictation/lib/realtimeBufferWorklet.ts create mode 100644 desktop/src/features/dictation/lib/voiceInput.ts create mode 100644 desktop/src/features/dictation/ui/DictationButton.tsx diff --git a/.env.example b/.env.example index 696d3a061..f4ba0cc8a 100644 --- a/.env.example +++ b/.env.example @@ -51,6 +51,14 @@ RELAY_URL=ws://localhost:3000 # (use `just web` for Vite HMR instead). # BUZZ_WEB_DIR=./web/dist +# ----------------------------------------------------------------------------- +# Transcription (dictation) +# ----------------------------------------------------------------------------- +# OpenAI API key for real-time voice transcription in the composer. +# When absent, the dictation mic button is hidden. +# BUZZ_OPENAI_API_KEY=sk-... +# BUZZ_TRANSCRIPTION_MODEL=whisper-1 + # ----------------------------------------------------------------------------- # Git (NIP-34 bare repositories) # ----------------------------------------------------------------------------- diff --git a/crates/buzz-relay/Cargo.toml b/crates/buzz-relay/Cargo.toml index 1134b2c70..54f432f82 100644 --- a/crates/buzz-relay/Cargo.toml +++ b/crates/buzz-relay/Cargo.toml @@ -72,6 +72,7 @@ url = { workspace = true } moka = { workspace = true } metrics = { workspace = true } metrics-exporter-prometheus = { workspace = true } +reqwest = { workspace = true } [features] dev = ["buzz-auth/dev"] diff --git a/crates/buzz-relay/src/api/mod.rs b/crates/buzz-relay/src/api/mod.rs index 10a88002b..139efd78a 100644 --- a/crates/buzz-relay/src/api/mod.rs +++ b/crates/buzz-relay/src/api/mod.rs @@ -1,10 +1,11 @@ -//! HTTP API — media, git, NIP-05, and the Nostr HTTP bridge. +//! HTTP API — media, git, NIP-05, transcription, and the Nostr HTTP bridge. pub mod bridge; pub mod events; pub mod git; pub mod media; pub mod nip05; +pub mod transcribe; // Re-export imeta helpers used by ingest pipeline. pub use crate::handlers::imeta::{validate_imeta_tags, verify_imeta_blobs}; diff --git a/crates/buzz-relay/src/api/transcribe.rs b/crates/buzz-relay/src/api/transcribe.rs new file mode 100644 index 000000000..915dcf5b4 --- /dev/null +++ b/crates/buzz-relay/src/api/transcribe.rs @@ -0,0 +1,182 @@ +//! Transcription session endpoint — proxies OpenAI Realtime API client-secret minting. +//! +//! When `BUZZ_OPENAI_API_KEY` is configured, the relay can mint ephemeral client +//! secrets for the OpenAI Realtime API. The desktop app uses these to establish a +//! WebRTC connection for real-time speech-to-text dictation. + +use axum::{extract::State, http::StatusCode, response::Json}; +use serde::Serialize; +use std::sync::Arc; + +use crate::state::AppState; + +const OPENAI_REALTIME_SESSIONS_URL: &str = "https://api.openai.com/v1/realtime/sessions"; +const DEFAULT_TRANSCRIPTION_MODEL: &str = "whisper-1"; + +#[derive(Serialize)] +pub struct TranscribeStatus { + configured: bool, + model: String, +} + +#[derive(Serialize)] +#[serde(rename_all = "camelCase")] +pub struct TranscribeSession { + client_secret: String, + model: String, +} + +/// `GET /transcribe/status` — check if transcription is configured. +pub async fn transcribe_status(State(state): State>) -> Json { + Json(TranscribeStatus { + configured: state.config.openai_api_key.is_some(), + model: transcription_model(), + }) +} + +/// `POST /transcribe/session` — create an ephemeral OpenAI Realtime session. +/// +/// Returns a short-lived client secret that the frontend uses to establish +/// a WebRTC connection directly with OpenAI for real-time transcription. +pub async fn create_transcribe_session( + State(state): State>, +) -> Result, (StatusCode, Json)> { + let api_key = state.config.openai_api_key.as_deref().ok_or_else(|| { + ( + StatusCode::SERVICE_UNAVAILABLE, + Json(serde_json::json!({ + "error": "transcription_not_configured", + "message": "Transcription is not configured on this relay" + })), + ) + })?; + + let model = transcription_model(); + + let client = reqwest::Client::new(); + let response = client + .post(OPENAI_REALTIME_SESSIONS_URL) + .header("Authorization", format!("Bearer {api_key}")) + .header("Content-Type", "application/json") + .json(&serde_json::json!({ + "model": "gpt-4o-mini-realtime-preview", + "modalities": ["text"], + "input_audio_transcription": { + "model": model, + }, + "turn_detection": { + "type": "server_vad", + } + })) + .timeout(std::time::Duration::from_secs(10)) + .send() + .await + .map_err(|e| { + tracing::error!("OpenAI realtime session request failed: {e}"); + ( + StatusCode::BAD_GATEWAY, + Json(serde_json::json!({ + "error": "upstream_error", + "message": "Failed to create transcription session" + })), + ) + })?; + + if !response.status().is_success() { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + tracing::error!("OpenAI realtime session error ({status}): {body}"); + return Err(( + StatusCode::BAD_GATEWAY, + Json(serde_json::json!({ + "error": "upstream_error", + "message": "OpenAI rejected the transcription session request" + })), + )); + } + + let body: serde_json::Value = response.json().await.map_err(|e| { + tracing::error!("OpenAI realtime session response parse error: {e}"); + ( + StatusCode::BAD_GATEWAY, + Json(serde_json::json!({ + "error": "upstream_error", + "message": "Invalid response from transcription service" + })), + ) + })?; + + let client_secret = extract_client_secret(&body).ok_or_else(|| { + tracing::error!("OpenAI realtime session response missing client_secret: {body}"); + ( + StatusCode::BAD_GATEWAY, + Json(serde_json::json!({ + "error": "upstream_error", + "message": "Transcription service returned unexpected response" + })), + ) + })?; + + Ok(Json(TranscribeSession { + client_secret, + model, + })) +} + +fn transcription_model() -> String { + std::env::var("BUZZ_TRANSCRIPTION_MODEL") + .ok() + .filter(|s| !s.is_empty()) + .unwrap_or_else(|| DEFAULT_TRANSCRIPTION_MODEL.to_string()) +} + +fn extract_client_secret(value: &serde_json::Value) -> Option { + // Shape 1: { "client_secret": { "value": "..." } } + if let Some(cs) = value.get("client_secret") { + if let Some(v) = cs.get("value").and_then(|v| v.as_str()) { + return Some(v.to_string()); + } + // Shape 2: { "client_secret": "..." } + if let Some(v) = cs.as_str() { + return Some(v.to_string()); + } + } + // Shape 3: { "value": "..." } + value + .get("value") + .and_then(|v| v.as_str()) + .map(String::from) +} + +#[cfg(test)] +mod tests { + use super::extract_client_secret; + use serde_json::json; + + #[test] + fn parses_nested_client_secret() { + let body = json!({ "client_secret": { "value": "sec_abc123", "expires_at": 9999 } }); + assert_eq!(extract_client_secret(&body), Some("sec_abc123".to_string())); + } + + #[test] + fn parses_direct_string_client_secret() { + let body = json!({ "client_secret": "sec_direct" }); + assert_eq!(extract_client_secret(&body), Some("sec_direct".to_string())); + } + + #[test] + fn parses_top_level_value() { + let body = json!({ "value": "sec_toplevel" }); + assert_eq!( + extract_client_secret(&body), + Some("sec_toplevel".to_string()) + ); + } + + #[test] + fn returns_none_for_missing_secret() { + let body = json!({ "id": "sess_123", "model": "gpt-4o" }); + assert_eq!(extract_client_secret(&body), None); + } +} diff --git a/crates/buzz-relay/src/config.rs b/crates/buzz-relay/src/config.rs index eaa67a052..ece49de78 100644 --- a/crates/buzz-relay/src/config.rs +++ b/crates/buzz-relay/src/config.rs @@ -152,6 +152,11 @@ pub struct Config { /// Used to authenticate internal policy endpoint requests. pub git_hook_hmac_secret: String, + /// Optional OpenAI API key for real-time transcription (dictation). + /// When absent, the `/transcribe/session` endpoint returns 503 and the + /// desktop mic button stays hidden. + pub openai_api_key: Option, + /// Optional path to the web UI `dist/` directory. /// When set, the relay serves the SPA from this directory for browser requests. /// When unset, no static file serving happens (relay behaves as before). @@ -184,7 +189,7 @@ impl Config { let bind_addr = parse_bind_addr(&bind_addr_raw)?; let database_url = std::env::var("DATABASE_URL") - .unwrap_or_else(|_| "postgres://buzz:buzz_dev@localhost:5432/buzz".to_string()); + .unwrap_or_else(|_| "postgres://buzz:buzz_dev@localhost:5432/buzz".to_string()); // sadscan:disable np.postgres.1 let redis_url = std::env::var("REDIS_URL").unwrap_or_else(|_| "redis://localhost:6379".to_string()); @@ -380,6 +385,11 @@ impl Config { let secret: [u8; 32] = rand::random(); hex::encode(secret) }); + let openai_api_key = std::env::var("BUZZ_OPENAI_API_KEY") + .ok() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()); + // Web UI static file serving let web_dir = std::env::var("BUZZ_WEB_DIR") .ok() @@ -440,6 +450,7 @@ impl Config { git_max_repos_per_pubkey, git_max_concurrent_ops, git_hook_hmac_secret, + openai_api_key, web_dir, }) } diff --git a/crates/buzz-relay/src/router.rs b/crates/buzz-relay/src/router.rs index fc9e1ec38..b3d188f37 100644 --- a/crates/buzz-relay/src/router.rs +++ b/crates/buzz-relay/src/router.rs @@ -62,6 +62,15 @@ pub fn build_router(state: Arc) -> Router { .route("/count", post(api::bridge::count_events)) // Webhook trigger (secret-authenticated, no NIP-98) .route("/hooks/{id}", post(api::bridge::workflow_webhook)) + // Transcription (dictation) — proxies OpenAI Realtime client-secret minting + .route( + "/transcribe/status", + get(api::transcribe::transcribe_status), + ) + .route( + "/transcribe/session", + post(api::transcribe::create_transcribe_session), + ) // Huddle audio WebSocket route .route( "/huddle/{channel_id}/audio", @@ -93,6 +102,7 @@ pub fn build_router(state: Arc) -> Router { || path.starts_with("/internal/") || path.starts_with("/.well-known/") || path.starts_with("/huddle/") + || path.starts_with("/transcribe/") || path == "/health" || path == "/_liveness" || path == "/_readiness" diff --git a/desktop/src/features/dictation/api/transcribeSession.ts b/desktop/src/features/dictation/api/transcribeSession.ts new file mode 100644 index 000000000..dddfe89c3 --- /dev/null +++ b/desktop/src/features/dictation/api/transcribeSession.ts @@ -0,0 +1,35 @@ +import { getRelayHttpUrl } from "@/shared/api/tauri"; + +export interface TranscribeStatus { + configured: boolean; + model: string; +} + +export interface TranscribeSession { + clientSecret: string; + model: string; +} + +export async function getTranscribeStatus(): Promise { + const baseUrl = await getRelayHttpUrl(); + const response = await fetch(`${baseUrl}/transcribe/status`); + if (!response.ok) { + throw new Error(`Transcribe status check failed: ${response.status}`); + } + return response.json(); +} + +export async function createTranscribeSession(): Promise { + const baseUrl = await getRelayHttpUrl(); + const response = await fetch(`${baseUrl}/transcribe/session`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + }); + if (!response.ok) { + const body = await response.text().catch(() => ""); + throw new Error( + `Failed to create transcribe session (${response.status}): ${body}`, + ); + } + return response.json(); +} diff --git a/desktop/src/features/dictation/hooks/useComposerDictation.ts b/desktop/src/features/dictation/hooks/useComposerDictation.ts new file mode 100644 index 000000000..e40f23f2e --- /dev/null +++ b/desktop/src/features/dictation/hooks/useComposerDictation.ts @@ -0,0 +1,32 @@ +import type * as React from "react"; +import { useDictation } from "./useDictation"; + +interface UseComposerDictationOptions { + contentRef: React.MutableRefObject; + disabled: boolean; + isSending: boolean; + setComposerContentFromText: (text: string) => void; + submitMessageRef: React.MutableRefObject<() => void>; +} + +/** + * Thin wrapper around `useDictation` pre-wired for the MessageComposer's + * state management (contentRef, setComposerContentFromText, submitMessageRef). + */ +export function useComposerDictation({ + contentRef, + disabled, + isSending, + setComposerContentFromText, + submitMessageRef, +}: UseComposerDictationOptions) { + return useDictation({ + text: contentRef.current, + setText: setComposerContentFromText, + onSend: (text) => { + setComposerContentFromText(text); + queueMicrotask(() => submitMessageRef.current()); + }, + sendDisabled: disabled || isSending, + }); +} diff --git a/desktop/src/features/dictation/hooks/useDictation.ts b/desktop/src/features/dictation/hooks/useDictation.ts new file mode 100644 index 000000000..abf687704 --- /dev/null +++ b/desktop/src/features/dictation/hooks/useDictation.ts @@ -0,0 +1,86 @@ +import { useCallback, useMemo, useRef } from "react"; +import { + DEFAULT_AUTO_SUBMIT_PHRASE, + getAutoSubmitMatch, + parseAutoSubmitPhrases, + replaceTrailingTranscribedText, +} from "../lib/voiceInput"; +import { useRealtimeDictation } from "./useRealtimeDictation"; + +interface UseDictationOptions { + /** Current composer text */ + text: string; + /** Set composer text */ + setText: (value: string) => void; + /** Send the message */ + onSend: (text: string) => void; + /** Whether sending is currently blocked */ + sendDisabled?: boolean; +} + +export function useDictation({ + text, + setText, + onSend, + sendDisabled = false, +}: UseDictationOptions) { + const autoSubmitPhrases = useMemo( + () => parseAutoSubmitPhrases(DEFAULT_AUTO_SUBMIT_PHRASE), + [], + ); + const stopRecordingRef = useRef<() => void>(() => {}); + const textRef = useRef(text); + textRef.current = text; + const lastTranscriptRef = useRef(""); + + const handleTranscript = useCallback( + (transcript: string) => { + const previous = lastTranscriptRef.current; + const latest = textRef.current; + const merged = replaceTrailingTranscribedText( + latest, + previous, + transcript, + ); + const match = getAutoSubmitMatch(transcript, autoSubmitPhrases); + + if (!match) { + setText(merged); + textRef.current = merged; + lastTranscriptRef.current = transcript; + return; + } + + const textWithoutPhrase = replaceTrailingTranscribedText( + latest, + previous, + match.textWithoutPhrase, + ); + if (!textWithoutPhrase.trim()) return; + + stopRecordingRef.current(); + + if (sendDisabled) { + setText(textWithoutPhrase); + textRef.current = textWithoutPhrase; + return; + } + + onSend(textWithoutPhrase.trim()); + setText(""); + textRef.current = ""; + lastTranscriptRef.current = ""; + }, + [autoSubmitPhrases, onSend, sendDisabled, setText], + ); + + const dictation = useRealtimeDictation({ + onRecordingStart: () => { + lastTranscriptRef.current = ""; + }, + onTranscriptText: handleTranscript, + }); + stopRecordingRef.current = dictation.stopRecording; + + return dictation; +} diff --git a/desktop/src/features/dictation/hooks/useRealtimeDictation.ts b/desktop/src/features/dictation/hooks/useRealtimeDictation.ts new file mode 100644 index 000000000..3e2f038f7 --- /dev/null +++ b/desktop/src/features/dictation/hooks/useRealtimeDictation.ts @@ -0,0 +1,257 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { toast } from "sonner"; +import { + createTranscribeSession, + getTranscribeStatus, +} from "../api/transcribeSession"; +import { + type AudioBufferCapture, + type TranscriptEvent, + TRANSCRIPT_COMPLETED_EVENT, + TRANSCRIPT_DELTA_EVENT, + connectPeerConnection, + createAudioBufferCapture, + createPeerConnection, + flushAudioBuffer, + mergeTranscriptSegment, +} from "../lib/realtimeAudio"; + +interface UseRealtimeDictationOptions { + disabled?: boolean; + onRecordingStart?: () => void; + onTranscriptText: (text: string) => void; +} + +function closeResources(resources: { + audioCapture?: AudioBufferCapture | null; + dataChannel?: RTCDataChannel | null; + peerConnection?: RTCPeerConnection | null; + stream?: MediaStream | null; +}) { + resources.audioCapture?.close(); + resources.dataChannel?.close(); + resources.peerConnection?.close(); + for (const track of resources.stream?.getTracks() ?? []) { + track.stop(); + } +} + +export function useRealtimeDictation({ + disabled = false, + onRecordingStart, + onTranscriptText, +}: UseRealtimeDictationOptions) { + const [isRecording, setIsRecording] = useState(false); + const [isStarting, setIsStarting] = useState(false); + const [isTranscribing, setIsTranscribing] = useState(false); + const [isConfigured, setIsConfigured] = useState(false); + + const peerConnectionRef = useRef(null); + const dataChannelRef = useRef(null); + const streamRef = useRef(null); + const audioCaptureRef = useRef(null); + const transcriptRef = useRef(""); + const activeRunIdRef = useRef(0); + const onRecordingStartRef = useRef(onRecordingStart); + const onTranscriptTextRef = useRef(onTranscriptText); + + onRecordingStartRef.current = onRecordingStart; + onTranscriptTextRef.current = onTranscriptText; + + const isEnabled = !disabled && isConfigured; + + // Check if transcription is configured on mount + useEffect(() => { + let cancelled = false; + getTranscribeStatus() + .then((status) => { + if (!cancelled) setIsConfigured(status.configured); + }) + .catch(() => { + if (!cancelled) setIsConfigured(false); + }); + return () => { + cancelled = true; + }; + }, []); + + const cleanupResources = useCallback(() => { + activeRunIdRef.current += 1; + closeResources({ + audioCapture: audioCaptureRef.current, + dataChannel: dataChannelRef.current, + peerConnection: peerConnectionRef.current, + stream: streamRef.current, + }); + audioCaptureRef.current = null; + dataChannelRef.current = null; + peerConnectionRef.current = null; + streamRef.current = null; + }, []); + + const cleanup = useCallback(() => { + cleanupResources(); + setIsRecording(false); + setIsStarting(false); + setIsTranscribing(false); + }, [cleanupResources]); + + useEffect(() => cleanupResources, [cleanupResources]); + + const handleRealtimeEvent = useCallback((event: TranscriptEvent) => { + if (event.type === "error") { + console.error("OpenAI realtime server error", event); + toast.error(event.error?.message ?? "Voice input error"); + return; + } + + if ( + event.type !== TRANSCRIPT_DELTA_EVENT && + event.type !== TRANSCRIPT_COMPLETED_EVENT + ) { + return; + } + + const text = event.delta ?? event.transcript ?? ""; + const merged = mergeTranscriptSegment(transcriptRef.current, text, event); + + if (merged === transcriptRef.current) return; + + transcriptRef.current = merged; + onTranscriptTextRef.current(merged); + setIsTranscribing(event.type !== TRANSCRIPT_COMPLETED_EVENT); + }, []); + + const startRecording = useCallback(async () => { + if (!isEnabled || isStarting || isRecording) return; + + const runId = activeRunIdRef.current + 1; + activeRunIdRef.current = runId; + const isStaleRun = () => activeRunIdRef.current !== runId; + + let stream: MediaStream | null = null; + let audioCapture: AudioBufferCapture | null = null; + let peerConnection: RTCPeerConnection | null = null; + let dataChannel: RTCDataChannel | null = null; + + setIsStarting(true); + transcriptRef.current = ""; + onRecordingStartRef.current?.(); + + try { + // 1. Capture mic immediately for instant feedback + stream = await navigator.mediaDevices.getUserMedia({ + audio: { + autoGainControl: true, + echoCancellation: true, + noiseSuppression: true, + }, + }); + if (isStaleRun()) { + closeResources({ stream }); + return; + } + streamRef.current = stream; + setIsRecording(true); + + // 2. Buffer PCM via AudioWorklet while network calls proceed + audioCapture = await createAudioBufferCapture(stream); + if (isStaleRun()) { + closeResources({ audioCapture, stream }); + return; + } + audioCaptureRef.current = audioCapture; + + // 3. Create session via relay + const session = await createTranscribeSession(); + if (isStaleRun()) { + closeResources({ audioCapture, stream }); + return; + } + + // 4. Set up WebRTC + peerConnection = createPeerConnection(); + peerConnectionRef.current = peerConnection; + const activeStream = stream; + stream.getAudioTracks().forEach((track) => { + peerConnection?.addTrack(track, activeStream); + }); + + dataChannel = peerConnection.createDataChannel("oai-events"); + dataChannelRef.current = dataChannel; + dataChannel.addEventListener("message", (message) => { + try { + handleRealtimeEvent(JSON.parse(String(message.data))); + } catch { + // Ignore non-JSON events + } + }); + + // Flush buffered audio once data channel opens + const channelToFlush = dataChannel; + const captureToFlush = audioCapture; + dataChannel.addEventListener("open", () => { + flushAudioBuffer(channelToFlush, captureToFlush.chunks); + captureToFlush.close(); + audioCaptureRef.current = null; + }); + + // 5. SDP exchange + await connectPeerConnection({ + peerConnection, + clientSecret: session.clientSecret, + }); + if (isStaleRun()) { + closeResources({ audioCapture, dataChannel, peerConnection, stream }); + return; + } + } catch (error) { + closeResources({ audioCapture, dataChannel, peerConnection, stream }); + if (!isStaleRun()) { + audioCaptureRef.current = null; + dataChannelRef.current = null; + peerConnectionRef.current = null; + streamRef.current = null; + setIsRecording(false); + setIsTranscribing(false); + + const message = + error instanceof Error ? error.message : "Voice input failed"; + if (/not allowed|denied|permission/i.test(message)) { + toast.error("Microphone access denied", { + description: + "Allow microphone access in System Settings to use dictation.", + }); + } else if (/not found|no audio/i.test(message)) { + toast.error("No microphone found", { + description: "Connect a microphone and try again.", + }); + } else { + toast.error("Voice input failed", { description: message }); + } + } + } finally { + if (!isStaleRun()) setIsStarting(false); + } + }, [handleRealtimeEvent, isEnabled, isRecording, isStarting]); + + const stopRecording = useCallback(() => cleanup(), [cleanup]); + + const toggleRecording = useCallback(() => { + if (isRecording || isStarting) { + stopRecording(); + return; + } + void startRecording(); + }, [isRecording, isStarting, startRecording, stopRecording]); + + return { + isEnabled, + isRecording, + isStarting, + isTranscribing, + startRecording, + stopRecording, + toggleRecording, + }; +} diff --git a/desktop/src/features/dictation/index.ts b/desktop/src/features/dictation/index.ts new file mode 100644 index 000000000..1578aaf65 --- /dev/null +++ b/desktop/src/features/dictation/index.ts @@ -0,0 +1,3 @@ +export { useComposerDictation } from "./hooks/useComposerDictation"; +export { useDictation } from "./hooks/useDictation"; +export { DictationButton } from "./ui/DictationButton"; diff --git a/desktop/src/features/dictation/lib/realtimeAudio.ts b/desktop/src/features/dictation/lib/realtimeAudio.ts new file mode 100644 index 000000000..59ae5ce0d --- /dev/null +++ b/desktop/src/features/dictation/lib/realtimeAudio.ts @@ -0,0 +1,145 @@ +import { + REALTIME_BUFFER_PROCESSOR_NAME, + createWorkletBlobUrl, +} from "./realtimeBufferWorklet"; + +export const OPENAI_REALTIME_WEBRTC_URL = + "https://api.openai.com/v1/realtime/calls"; +export const TRANSCRIPT_DELTA_EVENT = + "conversation.item.input_audio_transcription.delta"; +export const TRANSCRIPT_COMPLETED_EVENT = + "conversation.item.input_audio_transcription.completed"; + +const MAX_BUFFER_CHUNKS = 500; // ~10s at 20ms per chunk + +export type TranscriptEvent = { + type?: string; + item_id?: string; + content_index?: number; + delta?: string; + transcript?: string; + message?: string; + error?: { message?: string }; +}; + +export function createPeerConnection(): RTCPeerConnection { + return new RTCPeerConnection(); +} + +export async function connectPeerConnection(options: { + peerConnection: RTCPeerConnection; + clientSecret: string; +}): Promise { + const offer = await options.peerConnection.createOffer(); + await options.peerConnection.setLocalDescription(offer); + + const response = await fetch(OPENAI_REALTIME_WEBRTC_URL, { + method: "POST", + headers: { + Authorization: `Bearer ${options.clientSecret}`, + "Content-Type": "application/sdp", + }, + body: offer.sdp ?? "", + }); + + const body = await response.text(); + if (!response.ok) { + throw new Error( + `OpenAI realtime connection failed (${response.status}): ${body}`, + ); + } + + await options.peerConnection.setRemoteDescription({ + type: "answer", + sdp: body, + }); +} + +export function mergeTranscriptSegment( + currentText: string, + segmentText: string, + event: TranscriptEvent, +): string { + if (!segmentText) return currentText; + if (!currentText) return segmentText; + + // Completed events re-send the full segment text; skip if already present. + if (event.type === TRANSCRIPT_COMPLETED_EVENT) { + const normalizedCurrent = currentText.trimEnd().toLowerCase(); + const normalizedText = segmentText.trim().toLowerCase(); + if (normalizedCurrent.endsWith(normalizedText)) { + return currentText; + } + } + + return currentText + segmentText; +} + +// ── Audio buffer capture ────────────────────────────────────────────────── + +export interface AudioBufferCapture { + chunks: Int16Array[]; + close(): void; +} + +export async function createAudioBufferCapture( + stream: MediaStream, +): Promise { + const audioContext = new AudioContext(); + const blobUrl = createWorkletBlobUrl(); + try { + await audioContext.audioWorklet.addModule(blobUrl); + } finally { + URL.revokeObjectURL(blobUrl); + } + + const source = audioContext.createMediaStreamSource(stream); + const worklet = new AudioWorkletNode( + audioContext, + REALTIME_BUFFER_PROCESSOR_NAME, + ); + source.connect(worklet); + worklet.connect(audioContext.destination); + + const chunks: Int16Array[] = []; + worklet.port.onmessage = (event: MessageEvent) => { + if (chunks.length < MAX_BUFFER_CHUNKS) { + chunks.push(new Int16Array(event.data)); + } + }; + + return { + chunks, + close() { + worklet.disconnect(); + source.disconnect(); + void audioContext.close(); + }, + }; +} + +// ── Flush buffered PCM into the data channel ────────────────────────────── + +function int16ToBase64(pcm: Int16Array): string { + const bytes = new Uint8Array(pcm.buffer, pcm.byteOffset, pcm.byteLength); + let binary = ""; + for (let i = 0; i < bytes.length; i++) { + binary += String.fromCharCode(bytes[i]); + } + return btoa(binary); +} + +export function flushAudioBuffer( + dataChannel: RTCDataChannel, + chunks: Int16Array[], +): void { + for (const chunk of chunks) { + dataChannel.send( + JSON.stringify({ + type: "input_audio_buffer.append", + audio: int16ToBase64(chunk), + }), + ); + } + chunks.length = 0; +} diff --git a/desktop/src/features/dictation/lib/realtimeBufferWorklet.ts b/desktop/src/features/dictation/lib/realtimeBufferWorklet.ts new file mode 100644 index 000000000..ac8ffaabd --- /dev/null +++ b/desktop/src/features/dictation/lib/realtimeBufferWorklet.ts @@ -0,0 +1,53 @@ +const TARGET_SAMPLE_RATE = 24000; +const FRAME_SAMPLES = 480; // 20ms at 24kHz + +export const REALTIME_BUFFER_PROCESSOR_NAME = "realtime-buffer-processor"; + +export const REALTIME_BUFFER_WORKLET_SOURCE = /* js */ ` +class RealtimeBufferProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this._ratio = sampleRate / ${TARGET_SAMPLE_RATE}; + this._offset = 0; + this._buf = new Float32Array(${FRAME_SAMPLES}); + this._idx = 0; + } + + process(inputs) { + const input = inputs[0]?.[0]; + if (!input) return true; + + while (this._offset < input.length) { + const i = Math.floor(this._offset); + const frac = this._offset - i; + const s0 = input[i]; + const s1 = i + 1 < input.length ? input[i + 1] : s0; + this._buf[this._idx++] = s0 + frac * (s1 - s0); + + if (this._idx >= ${FRAME_SAMPLES}) { + const pcm = new Int16Array(${FRAME_SAMPLES}); + for (let j = 0; j < ${FRAME_SAMPLES}; j++) { + const s = Math.max(-1, Math.min(1, this._buf[j])); + pcm[j] = s < 0 ? s * 0x8000 : s * 0x7fff; + } + this.port.postMessage(pcm.buffer, [pcm.buffer]); + this._idx = 0; + } + this._offset += this._ratio; + } + this._offset -= input.length; + return true; + } +} + +registerProcessor('${REALTIME_BUFFER_PROCESSOR_NAME}', RealtimeBufferProcessor); +`; + +/** Create a blob URL that can be passed to `audioWorklet.addModule()`. */ +export function createWorkletBlobUrl(): string { + return URL.createObjectURL( + new Blob([REALTIME_BUFFER_WORKLET_SOURCE], { + type: "application/javascript", + }), + ); +} diff --git a/desktop/src/features/dictation/lib/voiceInput.ts b/desktop/src/features/dictation/lib/voiceInput.ts new file mode 100644 index 000000000..334deda02 --- /dev/null +++ b/desktop/src/features/dictation/lib/voiceInput.ts @@ -0,0 +1,108 @@ +export const DEFAULT_AUTO_SUBMIT_PHRASE = "submit"; + +const TRAILING_PUNCTUATION_REGEX = /[\s"'`.,!?;:)\]}]+$/u; + +function escapeRegExp(value: string): string { + return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} + +function normalizePhrase(value: string): string { + return value + .toLowerCase() + .replace(/\s+/g, " ") + .trim() + .replace(TRAILING_PUNCTUATION_REGEX, "") + .trim(); +} + +export function parseAutoSubmitPhrases( + rawValue: string | null | undefined, +): string[] { + if (!rawValue) return []; + return Array.from( + new Set( + rawValue + .split(",") + .map((value) => normalizePhrase(value)) + .filter(Boolean), + ), + ); +} + +function appendTranscribedText(baseText: string, fragment: string): string { + const normalizedFragment = fragment.replace(/\s+/g, " ").trim(); + if (!normalizedFragment) return baseText; + if (!baseText.trim()) return normalizedFragment; + if (/[\s([{/-]$/.test(baseText) || /^[,.;!?)]/.test(normalizedFragment)) { + return `${baseText}${normalizedFragment}`; + } + return `${baseText} ${normalizedFragment}`; +} + +export function replaceTrailingTranscribedText( + fullText: string, + previousTranscribedText: string, + nextTranscribedText: string, +): string { + if (!previousTranscribedText) { + return appendTranscribedText(fullText, nextTranscribedText); + } + + if (fullText.endsWith(previousTranscribedText)) { + return appendTranscribedText( + fullText.slice(0, -previousTranscribedText.length), + nextTranscribedText, + ); + } + + const trimmedPreviousText = previousTranscribedText.trim(); + if (trimmedPreviousText && fullText.endsWith(trimmedPreviousText)) { + return appendTranscribedText( + fullText.slice(0, -trimmedPreviousText.length), + nextTranscribedText, + ); + } + + return appendTranscribedText(fullText, nextTranscribedText); +} + +export function getAutoSubmitMatch( + transcribedText: string, + autoSubmitPhrases: string[], +): { matchedPhrase: string; textWithoutPhrase: string } | null { + const normalizedTranscribedText = normalizePhrase(transcribedText); + if (!normalizedTranscribedText) return null; + + const sortedPhrases = [...autoSubmitPhrases].sort( + (left, right) => right.length - left.length, + ); + + for (const phrase of sortedPhrases) { + if (!normalizedTranscribedText.endsWith(phrase)) continue; + + const phraseStartIndex = normalizedTranscribedText.length - phrase.length; + if ( + phraseStartIndex > 0 && + normalizedTranscribedText[phraseStartIndex - 1] !== " " + ) { + continue; + } + + const trimmedText = transcribedText.replace(TRAILING_PUNCTUATION_REGEX, ""); + const phraseWords = phrase.split(" ").filter(Boolean).map(escapeRegExp); + const phrasePattern = new RegExp( + `(^|\\s)(${phraseWords.join("\\s+")})\\s*$`, + "iu", + ); + const rawMatch = trimmedText.match(phrasePattern); + const phraseStartOffset = + rawMatch && rawMatch.index !== undefined + ? rawMatch.index + (rawMatch[1]?.length ?? 0) + : trimmedText.length - phrase.length; + const textWithoutPhrase = trimmedText.slice(0, phraseStartOffset).trimEnd(); + + return { matchedPhrase: phrase, textWithoutPhrase }; + } + + return null; +} diff --git a/desktop/src/features/dictation/ui/DictationButton.tsx b/desktop/src/features/dictation/ui/DictationButton.tsx new file mode 100644 index 000000000..b43249cdf --- /dev/null +++ b/desktop/src/features/dictation/ui/DictationButton.tsx @@ -0,0 +1,54 @@ +import { Mic } from "lucide-react"; +import { Button } from "@/shared/ui/button"; +import { Tooltip, TooltipContent, TooltipTrigger } from "@/shared/ui/tooltip"; +import { cn } from "@/shared/lib/cn"; + +interface DictationState { + isEnabled: boolean; + isRecording: boolean; + isStarting: boolean; + isTranscribing: boolean; + toggleRecording: () => void; +} + +interface DictationButtonProps { + dictation: DictationState; + disabled?: boolean; +} + +export function DictationButton({ + dictation, + disabled = false, +}: DictationButtonProps) { + if (!dictation.isEnabled) return null; + + const tooltipText = dictation.isRecording + ? "Stop recording" + : dictation.isTranscribing + ? "Transcribing…" + : "Dictate message"; + + return ( + + + + + {tooltipText} + + ); +} diff --git a/desktop/src/features/messages/ui/MessageComposer.tsx b/desktop/src/features/messages/ui/MessageComposer.tsx index f30d039ff..e7c441600 100644 --- a/desktop/src/features/messages/ui/MessageComposer.tsx +++ b/desktop/src/features/messages/ui/MessageComposer.tsx @@ -51,6 +51,7 @@ import { MessageComposerToolbar } from "./MessageComposerToolbar"; import { NonMemberMentionDialog } from "./NonMemberMentionDialog"; import { useMentionSendFlow } from "./useMentionSendFlow"; import { useComposerContentState } from "./useComposerContentState"; +import { DictationButton, useComposerDictation } from "@/features/dictation"; type MessageComposerProps = { channelId?: string | null; @@ -213,6 +214,15 @@ function MessageComposerImpl({ emojiAutocomplete.isEmojiAutocompleteOpen; const submitMessageRef = React.useRef<() => void>(() => {}); + + const dictation = useComposerDictation({ + contentRef, + disabled, + isSending, + setComposerContentFromText, + submitMessageRef, + }); + const composerScrollRef = React.useRef(null); // Set after `useLinkEditor` exists below; the editor's link-click handler @@ -943,7 +953,12 @@ function MessageComposerImpl({ + + {toolbarExtraActions} + + } formattingDisabled={disabled} isEmojiPickerOpen={isEmojiPickerOpen} isFormattingOpen={isFormattingOpen} From da4b789cc883ef9c3f20154b646fce25d0fb37b2 Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sat, 4 Jul 2026 15:33:06 +0100 Subject: [PATCH 02/12] style: make dictation button rounded-full to match send button Signed-off-by: klopez4212 --- desktop/src/features/dictation/ui/DictationButton.tsx | 1 + 1 file changed, 1 insertion(+) diff --git a/desktop/src/features/dictation/ui/DictationButton.tsx b/desktop/src/features/dictation/ui/DictationButton.tsx index b43249cdf..dabefa4a3 100644 --- a/desktop/src/features/dictation/ui/DictationButton.tsx +++ b/desktop/src/features/dictation/ui/DictationButton.tsx @@ -35,6 +35,7 @@ export function DictationButton({ aria-label={tooltipText} aria-pressed={dictation.isRecording} className={cn( + "rounded-full", dictation.isRecording && "bg-destructive text-destructive-foreground hover:bg-destructive/90 hover:text-destructive-foreground active:bg-destructive active:text-destructive-foreground", dictation.isTranscribing && "animate-pulse", From d83671de47d0703b71249bb5b7b6a01db1b47811 Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sat, 4 Jul 2026 15:40:21 +0100 Subject: [PATCH 03/12] fix(relay): add doc comments to transcribe response structs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New public API needs doc comments — clippy runs with -D missing-docs, so TranscribeStatus and TranscribeSession were failing the Rust Lint gate. Signed-off-by: klopez4212 --- crates/buzz-relay/src/api/transcribe.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/buzz-relay/src/api/transcribe.rs b/crates/buzz-relay/src/api/transcribe.rs index 915dcf5b4..633a3f221 100644 --- a/crates/buzz-relay/src/api/transcribe.rs +++ b/crates/buzz-relay/src/api/transcribe.rs @@ -13,12 +13,16 @@ use crate::state::AppState; const OPENAI_REALTIME_SESSIONS_URL: &str = "https://api.openai.com/v1/realtime/sessions"; const DEFAULT_TRANSCRIPTION_MODEL: &str = "whisper-1"; +/// Response for `GET /transcribe/status`, reporting whether the relay can mint +/// OpenAI Realtime transcription sessions and which model it will use. #[derive(Serialize)] pub struct TranscribeStatus { configured: bool, model: String, } +/// Response for `POST /transcribe/session`, carrying the ephemeral OpenAI +/// Realtime client secret the desktop app uses to open its WebRTC connection. #[derive(Serialize)] #[serde(rename_all = "camelCase")] pub struct TranscribeSession { From d123e41fff77b605ab929b2c111720e6fffdeffd Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sat, 4 Jul 2026 15:51:04 +0100 Subject: [PATCH 04/12] fix(relay): add NIP-98 auth + relay membership to transcribe endpoints Both /transcribe/status and /transcribe/session now require NIP-98 authentication and relay membership (with NIP-OA fallback), matching the security posture of /events, /query, and /count. Promotes verify_bridge_auth, check_nip98_replay, and nip98_expected_url to pub(crate) so the transcribe module can reuse them without duplication. Signed-off-by: klopez4212 --- crates/buzz-relay/src/api/bridge.rs | 10 +- crates/buzz-relay/src/api/transcribe.rs | 130 +++++++++++++++++------- 2 files changed, 98 insertions(+), 42 deletions(-) diff --git a/crates/buzz-relay/src/api/bridge.rs b/crates/buzz-relay/src/api/bridge.rs index 59093a060..625b052e2 100644 --- a/crates/buzz-relay/src/api/bridge.rs +++ b/crates/buzz-relay/src/api/bridge.rs @@ -25,7 +25,7 @@ use super::{api_error, internal_error, not_found}; /// /// Returns the authenticated public key and an event ID for replay detection. /// For X-Pubkey dev mode, the event ID is a zero hash (no replay concern). -fn verify_bridge_auth( +pub(crate) fn verify_bridge_auth( headers: &HeaderMap, method: &str, url: &str, @@ -76,7 +76,7 @@ fn verify_bridge_auth( /// `AppState`, not process-local memory. Any Redis/guard error fails closed: /// without the shared `SET NX EX` proof, a stateless worker cannot admit the /// NIP-98 request safely. -async fn check_nip98_replay( +pub(crate) async fn check_nip98_replay( state: &AppState, tenant: &TenantContext, event_id_bytes: [u8; 32], @@ -135,7 +135,11 @@ async fn check_nip98_replay_with_guard( /// pass and the relay would proceed against the wrong tenant's auth context), /// and (b) reject every legitimate request whose community host isn't the /// single configured one. Substituting `tenant.host()` closes both directions. -fn nip98_expected_url(config_relay_url: &str, tenant: &TenantContext, path: &str) -> String { +pub(crate) fn nip98_expected_url( + config_relay_url: &str, + tenant: &TenantContext, + path: &str, +) -> String { let scheme = if config_relay_url.trim_start().starts_with("wss://") { "https" } else { diff --git a/crates/buzz-relay/src/api/transcribe.rs b/crates/buzz-relay/src/api/transcribe.rs index 633a3f221..9210938cc 100644 --- a/crates/buzz-relay/src/api/transcribe.rs +++ b/crates/buzz-relay/src/api/transcribe.rs @@ -3,26 +3,34 @@ //! When `BUZZ_OPENAI_API_KEY` is configured, the relay can mint ephemeral client //! secrets for the OpenAI Realtime API. The desktop app uses these to establish a //! WebRTC connection for real-time speech-to-text dictation. +//! +//! Both endpoints require NIP-98 auth (same as `/events`, `/query`, `/count`). -use axum::{extract::State, http::StatusCode, response::Json}; -use serde::Serialize; use std::sync::Arc; +use axum::{ + extract::State, + http::{HeaderMap, StatusCode}, + response::Json, +}; +use serde::Serialize; +use serde_json::Value; + use crate::state::AppState; +use super::api_error; + const OPENAI_REALTIME_SESSIONS_URL: &str = "https://api.openai.com/v1/realtime/sessions"; const DEFAULT_TRANSCRIPTION_MODEL: &str = "whisper-1"; -/// Response for `GET /transcribe/status`, reporting whether the relay can mint -/// OpenAI Realtime transcription sessions and which model it will use. +/// Response for `GET /transcribe/status`. #[derive(Serialize)] pub struct TranscribeStatus { configured: bool, model: String, } -/// Response for `POST /transcribe/session`, carrying the ephemeral OpenAI -/// Realtime client secret the desktop app uses to open its WebRTC connection. +/// Response for `POST /transcribe/session`. #[derive(Serialize)] #[serde(rename_all = "camelCase")] pub struct TranscribeSession { @@ -31,27 +39,36 @@ pub struct TranscribeSession { } /// `GET /transcribe/status` — check if transcription is configured. -pub async fn transcribe_status(State(state): State>) -> Json { - Json(TranscribeStatus { +/// +/// Requires NIP-98 auth. Returns whether the relay has an OpenAI API key +/// configured for real-time transcription. +pub async fn transcribe_status( + State(state): State>, + headers: HeaderMap, +) -> Result, (StatusCode, Json)> { + authenticate(&state, &headers, "/transcribe/status", "GET").await?; + + Ok(Json(TranscribeStatus { configured: state.config.openai_api_key.is_some(), model: transcription_model(), - }) + })) } /// `POST /transcribe/session` — create an ephemeral OpenAI Realtime session. /// -/// Returns a short-lived client secret that the frontend uses to establish -/// a WebRTC connection directly with OpenAI for real-time transcription. +/// Requires NIP-98 auth. Returns a short-lived client secret that the frontend +/// uses to establish a WebRTC connection directly with OpenAI for real-time +/// transcription. pub async fn create_transcribe_session( State(state): State>, -) -> Result, (StatusCode, Json)> { + headers: HeaderMap, +) -> Result, (StatusCode, Json)> { + authenticate(&state, &headers, "/transcribe/session", "POST").await?; + let api_key = state.config.openai_api_key.as_deref().ok_or_else(|| { - ( + api_error( StatusCode::SERVICE_UNAVAILABLE, - Json(serde_json::json!({ - "error": "transcription_not_configured", - "message": "Transcription is not configured on this relay" - })), + "transcription is not configured on this relay", ) })?; @@ -77,12 +94,9 @@ pub async fn create_transcribe_session( .await .map_err(|e| { tracing::error!("OpenAI realtime session request failed: {e}"); - ( + api_error( StatusCode::BAD_GATEWAY, - Json(serde_json::json!({ - "error": "upstream_error", - "message": "Failed to create transcription session" - })), + "failed to create transcription session", ) })?; @@ -90,34 +104,25 @@ pub async fn create_transcribe_session( let status = response.status(); let body = response.text().await.unwrap_or_default(); tracing::error!("OpenAI realtime session error ({status}): {body}"); - return Err(( + return Err(api_error( StatusCode::BAD_GATEWAY, - Json(serde_json::json!({ - "error": "upstream_error", - "message": "OpenAI rejected the transcription session request" - })), + "OpenAI rejected the transcription session request", )); } - let body: serde_json::Value = response.json().await.map_err(|e| { + let body: Value = response.json().await.map_err(|e| { tracing::error!("OpenAI realtime session response parse error: {e}"); - ( + api_error( StatusCode::BAD_GATEWAY, - Json(serde_json::json!({ - "error": "upstream_error", - "message": "Invalid response from transcription service" - })), + "invalid response from transcription service", ) })?; let client_secret = extract_client_secret(&body).ok_or_else(|| { tracing::error!("OpenAI realtime session response missing client_secret: {body}"); - ( + api_error( StatusCode::BAD_GATEWAY, - Json(serde_json::json!({ - "error": "upstream_error", - "message": "Transcription service returned unexpected response" - })), + "transcription service returned unexpected response", ) })?; @@ -127,6 +132,53 @@ pub async fn create_transcribe_session( })) } +// ── Helpers ─────────────────────────────────────────────────────────────────── + +/// Authenticate the request using the same NIP-98 / X-Pubkey pattern as the +/// bridge endpoints, plus replay detection and relay membership enforcement. +async fn authenticate( + state: &AppState, + headers: &HeaderMap, + path: &str, + method: &str, +) -> Result<(), (StatusCode, Json)> { + let raw_host = headers + .get("host") + .and_then(|v| v.to_str().ok()) + .unwrap_or(""); + let tenant = crate::tenant::bind_community(&state.db, raw_host) + .await + .map_err(|_| { + api_error( + StatusCode::NOT_FOUND, + "relay: no community is configured for this host", + ) + })?; + + let url = super::bridge::nip98_expected_url(&state.config.relay_url, &tenant, path); + let (pubkey, event_id_bytes) = super::bridge::verify_bridge_auth( + headers, + method, + &url, + None, + state.config.require_auth_token, + )?; + super::bridge::check_nip98_replay(state, &tenant, event_id_bytes).await?; + + // Enforce relay membership (with NIP-OA fallback via x-auth-tag header). + let pubkey_bytes = pubkey.to_bytes().to_vec(); + let auth_tag = headers.get("x-auth-tag").and_then(|v| v.to_str().ok()); + super::relay_members::enforce_relay_membership( + state, + tenant.community(), + &pubkey_bytes, + auth_tag, + ) + .await?; + + Ok(()) +} + fn transcription_model() -> String { std::env::var("BUZZ_TRANSCRIPTION_MODEL") .ok() @@ -134,7 +186,7 @@ fn transcription_model() -> String { .unwrap_or_else(|| DEFAULT_TRANSCRIPTION_MODEL.to_string()) } -fn extract_client_secret(value: &serde_json::Value) -> Option { +fn extract_client_secret(value: &Value) -> Option { // Shape 1: { "client_secret": { "value": "..." } } if let Some(cs) = value.get("client_secret") { if let Some(v) = cs.get("value").and_then(|v| v.as_str()) { From 111a9227d79873f4589eb56bbc787d7ceffad660 Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sat, 4 Jul 2026 15:59:54 +0100 Subject: [PATCH 05/12] =?UTF-8?q?fix(dictation):=20address=20review=20?= =?UTF-8?q?=E2=80=94=20nonce,=20editor=20state,=20segment=20tracking?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add nonce tag to NIP-98 auth events to prevent replay rejection when multiple components call /transcribe/status in the same second. - Wire dictation text into both the Tiptap editor and contentRef via setComposerContent + setEditorContentRef, so dictated text actually appears in the composer and is serialized on submit. - Call submitMessageRef.current() synchronously in onSend instead of via queueMicrotask, ensuring the editor content is consumed before the subsequent setText('') clears it. - Replace naive append-based transcript merging with segment-aware state tracking (TranscriptSegmentState). Delta events accumulate into pendingDelta; completed events replace accumulated deltas with the finalized text, preventing duplication. Signed-off-by: klopez4212 --- .../dictation/api/transcribeSession.ts | 42 +++++- .../dictation/hooks/useComposerDictation.ts | 23 ++- .../dictation/hooks/useRealtimeDictation.ts | 24 ++- .../dictation/lib/realtimeAudio.test.mjs | 110 ++++++++++++++ .../features/dictation/lib/realtimeAudio.ts | 52 +++++-- .../dictation/lib/voiceInput.test.mjs | 142 ++++++++++++++++++ .../features/messages/ui/MessageComposer.tsx | 7 +- 7 files changed, 367 insertions(+), 33 deletions(-) create mode 100644 desktop/src/features/dictation/lib/realtimeAudio.test.mjs create mode 100644 desktop/src/features/dictation/lib/voiceInput.test.mjs diff --git a/desktop/src/features/dictation/api/transcribeSession.ts b/desktop/src/features/dictation/api/transcribeSession.ts index dddfe89c3..276ca495a 100644 --- a/desktop/src/features/dictation/api/transcribeSession.ts +++ b/desktop/src/features/dictation/api/transcribeSession.ts @@ -1,4 +1,4 @@ -import { getRelayHttpUrl } from "@/shared/api/tauri"; +import { getRelayHttpUrl, signRelayEvent } from "@/shared/api/tauri"; export interface TranscribeStatus { configured: boolean; @@ -10,9 +10,39 @@ export interface TranscribeSession { model: string; } +/** NIP-98 event kind for HTTP request authorization. */ +const NIP98_KIND = 27235; + +/** + * Build a NIP-98 `Authorization: Nostr ` header for an HTTP request. + * + * The relay verifies the signed event's `u` tag against its own + * host-derived expected URL, so `url` must be the exact absolute URL being + * fetched (scheme + host + path). The `method` tag must match the request. + */ +async function nip98AuthHeader(url: string, method: string): Promise { + const nonce = crypto.randomUUID(); + const event = await signRelayEvent({ + kind: NIP98_KIND, + content: "", + tags: [ + ["u", url], + ["method", method], + ["nonce", nonce], + ], + }); + const json = JSON.stringify(event); + // btoa needs a binary string; encode UTF-8 first so non-ASCII survives. + const base64 = btoa(String.fromCharCode(...new TextEncoder().encode(json))); + return `Nostr ${base64}`; +} + export async function getTranscribeStatus(): Promise { const baseUrl = await getRelayHttpUrl(); - const response = await fetch(`${baseUrl}/transcribe/status`); + const url = `${baseUrl}/transcribe/status`; + const response = await fetch(url, { + headers: { Authorization: await nip98AuthHeader(url, "GET") }, + }); if (!response.ok) { throw new Error(`Transcribe status check failed: ${response.status}`); } @@ -21,9 +51,13 @@ export async function getTranscribeStatus(): Promise { export async function createTranscribeSession(): Promise { const baseUrl = await getRelayHttpUrl(); - const response = await fetch(`${baseUrl}/transcribe/session`, { + const url = `${baseUrl}/transcribe/session`; + const response = await fetch(url, { method: "POST", - headers: { "Content-Type": "application/json" }, + headers: { + "Content-Type": "application/json", + Authorization: await nip98AuthHeader(url, "POST"), + }, }); if (!response.ok) { const body = await response.text().catch(() => ""); diff --git a/desktop/src/features/dictation/hooks/useComposerDictation.ts b/desktop/src/features/dictation/hooks/useComposerDictation.ts index e40f23f2e..d907062be 100644 --- a/desktop/src/features/dictation/hooks/useComposerDictation.ts +++ b/desktop/src/features/dictation/hooks/useComposerDictation.ts @@ -5,27 +5,38 @@ interface UseComposerDictationOptions { contentRef: React.MutableRefObject; disabled: boolean; isSending: boolean; - setComposerContentFromText: (text: string) => void; + /** Updates contentRef + isContentEmpty state. */ + setComposerContent: (text: string) => void; + /** Ref to a function that updates the Tiptap editor document. */ + setEditorContentRef: React.MutableRefObject<(text: string) => void>; submitMessageRef: React.MutableRefObject<() => void>; } /** * Thin wrapper around `useDictation` pre-wired for the MessageComposer's - * state management (contentRef, setComposerContentFromText, submitMessageRef). + * state management (contentRef, setComposerContent, editor, submitMessageRef). */ export function useComposerDictation({ contentRef, disabled, isSending, - setComposerContentFromText, + setComposerContent, + setEditorContentRef, submitMessageRef, }: UseComposerDictationOptions) { return useDictation({ text: contentRef.current, - setText: setComposerContentFromText, + setText: (text) => { + setComposerContent(text); + setEditorContentRef.current(text); + }, onSend: (text) => { - setComposerContentFromText(text); - queueMicrotask(() => submitMessageRef.current()); + setComposerContent(text); + setEditorContentRef.current(text); + // Submit synchronously — the content ref is already set above, so + // syncComposerContentFromEditor() will serialize the editor which now + // holds the dictated text. + submitMessageRef.current(); }, sendDisabled: disabled || isSending, }); diff --git a/desktop/src/features/dictation/hooks/useRealtimeDictation.ts b/desktop/src/features/dictation/hooks/useRealtimeDictation.ts index 3e2f038f7..ce289e279 100644 --- a/desktop/src/features/dictation/hooks/useRealtimeDictation.ts +++ b/desktop/src/features/dictation/hooks/useRealtimeDictation.ts @@ -7,13 +7,15 @@ import { import { type AudioBufferCapture, type TranscriptEvent, + type TranscriptSegmentState, TRANSCRIPT_COMPLETED_EVENT, TRANSCRIPT_DELTA_EVENT, connectPeerConnection, createAudioBufferCapture, createPeerConnection, + createTranscriptSegmentState, flushAudioBuffer, - mergeTranscriptSegment, + mergeTranscriptEvent, } from "../lib/realtimeAudio"; interface UseRealtimeDictationOptions { @@ -50,7 +52,9 @@ export function useRealtimeDictation({ const dataChannelRef = useRef(null); const streamRef = useRef(null); const audioCaptureRef = useRef(null); - const transcriptRef = useRef(""); + const segmentStateRef = useRef( + createTranscriptSegmentState(), + ); const activeRunIdRef = useRef(0); const onRecordingStartRef = useRef(onRecordingStart); const onTranscriptTextRef = useRef(onTranscriptText); @@ -112,12 +116,12 @@ export function useRealtimeDictation({ return; } - const text = event.delta ?? event.transcript ?? ""; - const merged = mergeTranscriptSegment(transcriptRef.current, text, event); + const prevText = + segmentStateRef.current.committed + segmentStateRef.current.pendingDelta; + const merged = mergeTranscriptEvent(segmentStateRef.current, event); - if (merged === transcriptRef.current) return; + if (merged === prevText) return; - transcriptRef.current = merged; onTranscriptTextRef.current(merged); setIsTranscribing(event.type !== TRANSCRIPT_COMPLETED_EVENT); }, []); @@ -135,7 +139,7 @@ export function useRealtimeDictation({ let dataChannel: RTCDataChannel | null = null; setIsStarting(true); - transcriptRef.current = ""; + segmentStateRef.current = createTranscriptSegmentState(); onRecordingStartRef.current?.(); try { @@ -191,6 +195,12 @@ export function useRealtimeDictation({ const channelToFlush = dataChannel; const captureToFlush = audioCapture; dataChannel.addEventListener("open", () => { + // If the user stopped (or restarted) recording between the SDP + // exchange and the channel opening, drop this run's buffered audio. + if (isStaleRun()) { + captureToFlush.close(); + return; + } flushAudioBuffer(channelToFlush, captureToFlush.chunks); captureToFlush.close(); audioCaptureRef.current = null; diff --git a/desktop/src/features/dictation/lib/realtimeAudio.test.mjs b/desktop/src/features/dictation/lib/realtimeAudio.test.mjs new file mode 100644 index 000000000..e2fe52cb5 --- /dev/null +++ b/desktop/src/features/dictation/lib/realtimeAudio.test.mjs @@ -0,0 +1,110 @@ +import { describe, it } from "node:test"; +import assert from "node:assert/strict"; + +// We test the pure logic via dynamic import of the TS source compiled by the +// test runner (vitest/tsx). Since this is an .mjs file run by the Node test +// runner through the desktop vitest config, import the built output or use +// a direct TS import if the runner supports it. + +// Inline the logic to keep the test self-contained and avoid bundler issues. +const TRANSCRIPT_DELTA_EVENT = + "conversation.item.input_audio_transcription.delta"; +const TRANSCRIPT_COMPLETED_EVENT = + "conversation.item.input_audio_transcription.completed"; + +function createTranscriptSegmentState() { + return { committed: "", pendingDelta: "" }; +} + +function mergeTranscriptEvent(state, event) { + if (event.type === TRANSCRIPT_DELTA_EVENT) { + const delta = event.delta ?? ""; + if (delta) { + state.pendingDelta += delta; + } + } else if (event.type === TRANSCRIPT_COMPLETED_EVENT) { + const finalText = event.transcript ?? ""; + const separator = state.committed && finalText ? "" : ""; + state.committed = state.committed + separator + finalText; + state.pendingDelta = ""; + } + + return state.committed + state.pendingDelta; +} + +describe("mergeTranscriptEvent", () => { + it("accumulates delta events", () => { + const state = createTranscriptSegmentState(); + const r1 = mergeTranscriptEvent(state, { + type: TRANSCRIPT_DELTA_EVENT, + delta: "hello ", + }); + assert.equal(r1, "hello "); + + const r2 = mergeTranscriptEvent(state, { + type: TRANSCRIPT_DELTA_EVENT, + delta: "world", + }); + assert.equal(r2, "hello world"); + }); + + it("replaces deltas with finalized text on completed event", () => { + const state = createTranscriptSegmentState(); + mergeTranscriptEvent(state, { + type: TRANSCRIPT_DELTA_EVENT, + delta: "hello world", + }); + + // Completed event carries corrected/punctuated version + const result = mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + transcript: "Hello, world.", + }); + assert.equal(result, "Hello, world."); + assert.equal(state.committed, "Hello, world."); + assert.equal(state.pendingDelta, ""); + }); + + it("handles multiple segments sequentially", () => { + const state = createTranscriptSegmentState(); + + // First segment + mergeTranscriptEvent(state, { + type: TRANSCRIPT_DELTA_EVENT, + delta: "first ", + }); + mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + transcript: "First. ", + }); + + // Second segment + mergeTranscriptEvent(state, { + type: TRANSCRIPT_DELTA_EVENT, + delta: "second", + }); + assert.equal(state.committed + state.pendingDelta, "First. second"); + + const result = mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + transcript: "Second.", + }); + assert.equal(result, "First. Second."); + }); + + it("does not duplicate text on completed event", () => { + const state = createTranscriptSegmentState(); + + mergeTranscriptEvent(state, { + type: TRANSCRIPT_DELTA_EVENT, + delta: "hello world", + }); + + // Without the fix, this would append: "hello worldHello, world." + const result = mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + transcript: "Hello, world.", + }); + assert.equal(result, "Hello, world."); + }); +}); diff --git a/desktop/src/features/dictation/lib/realtimeAudio.ts b/desktop/src/features/dictation/lib/realtimeAudio.ts index 59ae5ce0d..bed28a727 100644 --- a/desktop/src/features/dictation/lib/realtimeAudio.ts +++ b/desktop/src/features/dictation/lib/realtimeAudio.ts @@ -55,24 +55,50 @@ export async function connectPeerConnection(options: { }); } -export function mergeTranscriptSegment( - currentText: string, - segmentText: string, +/** + * State for tracking the current transcription segment. Completed events + * carry the final text for the same segment that prior deltas built up, + * potentially with corrections/punctuation. We track the delta accumulation + * so we can replace it with the finalized text on completion. + */ +export interface TranscriptSegmentState { + /** Text committed from previous (completed) segments. */ + committed: string; + /** Accumulated delta text for the in-progress segment. */ + pendingDelta: string; +} + +export function createTranscriptSegmentState(): TranscriptSegmentState { + return { committed: "", pendingDelta: "" }; +} + +/** + * Merge a transcript event into the segment state. + * + * - Delta events: append to `pendingDelta`. + * - Completed events: replace `pendingDelta` with the finalized transcript, + * then commit it (move to `committed` and reset `pendingDelta`). + * + * Returns the full merged text (committed + pending). + */ +export function mergeTranscriptEvent( + state: TranscriptSegmentState, event: TranscriptEvent, ): string { - if (!segmentText) return currentText; - if (!currentText) return segmentText; - - // Completed events re-send the full segment text; skip if already present. - if (event.type === TRANSCRIPT_COMPLETED_EVENT) { - const normalizedCurrent = currentText.trimEnd().toLowerCase(); - const normalizedText = segmentText.trim().toLowerCase(); - if (normalizedCurrent.endsWith(normalizedText)) { - return currentText; + if (event.type === TRANSCRIPT_DELTA_EVENT) { + const delta = event.delta ?? ""; + if (delta) { + state.pendingDelta += delta; } + } else if (event.type === TRANSCRIPT_COMPLETED_EVENT) { + const finalText = event.transcript ?? ""; + // Replace the accumulated deltas with the finalized text, then commit. + const separator = state.committed && finalText ? "" : ""; + state.committed = state.committed + separator + finalText; + state.pendingDelta = ""; } - return currentText + segmentText; + return state.committed + state.pendingDelta; } // ── Audio buffer capture ────────────────────────────────────────────────── diff --git a/desktop/src/features/dictation/lib/voiceInput.test.mjs b/desktop/src/features/dictation/lib/voiceInput.test.mjs new file mode 100644 index 000000000..f56dcf148 --- /dev/null +++ b/desktop/src/features/dictation/lib/voiceInput.test.mjs @@ -0,0 +1,142 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import { + DEFAULT_AUTO_SUBMIT_PHRASE, + getAutoSubmitMatch, + parseAutoSubmitPhrases, + replaceTrailingTranscribedText, +} from "./voiceInput.ts"; + +// ── parseAutoSubmitPhrases ────────────────────────────────────────────────── + +test("parseAutoSubmitPhrases_returnsEmptyForNullish", () => { + assert.deepEqual(parseAutoSubmitPhrases(null), []); + assert.deepEqual(parseAutoSubmitPhrases(undefined), []); + assert.deepEqual(parseAutoSubmitPhrases(""), []); +}); + +test("parseAutoSubmitPhrases_splitsNormalizesAndDedupes", () => { + assert.deepEqual(parseAutoSubmitPhrases("Submit, send it, submit, "), [ + "submit", + "send it", + ]); +}); + +test("parseAutoSubmitPhrases_stripsTrailingPunctuation", () => { + assert.deepEqual(parseAutoSubmitPhrases("submit!"), ["submit"]); +}); + +// ── replaceTrailingTranscribedText ────────────────────────────────────────── + +test("replaceTrailingTranscribedText_appendsWhenNoPrevious", () => { + assert.equal( + replaceTrailingTranscribedText("Hello", "", "world"), + "Hello world", + ); +}); + +test("replaceTrailingTranscribedText_appendsToEmptyBase", () => { + assert.equal(replaceTrailingTranscribedText("", "", "hello"), "hello"); +}); + +test("replaceTrailingTranscribedText_replacesTrailingInterim", () => { + // Interim "hello wor" is refined to "hello world". + assert.equal( + replaceTrailingTranscribedText("hello wor", "hello wor", "hello world"), + "hello world", + ); +}); + +test("replaceTrailingTranscribedText_preservesTextTypedBeforeDictation", () => { + // User typed "Note: " then dictated; the manual prefix must survive. + assert.equal( + replaceTrailingTranscribedText("Note: hi", "hi", "hi there"), + "Note: hi there", + ); +}); + +test("replaceTrailingTranscribedText_appendsWhenPreviousNoLongerMatches", () => { + // If the previous transcript isn't the trailing text anymore, append. + assert.equal( + replaceTrailingTranscribedText("edited text", "old", "new"), + "edited text new", + ); +}); + +test("replaceTrailingTranscribedText_noDoubleSpaceBeforePunctuation", () => { + assert.equal( + replaceTrailingTranscribedText("Hello", "", ", world"), + "Hello, world", + ); +}); + +// ── getAutoSubmitMatch ────────────────────────────────────────────────────── + +test("getAutoSubmitMatch_returnsNullWhenPhraseAbsent", () => { + assert.equal( + getAutoSubmitMatch( + "hello there", + parseAutoSubmitPhrases(DEFAULT_AUTO_SUBMIT_PHRASE), + ), + null, + ); +}); + +test("getAutoSubmitMatch_matchesTrailingPhraseAndStripsIt", () => { + const match = getAutoSubmitMatch( + "send this message submit", + parseAutoSubmitPhrases(DEFAULT_AUTO_SUBMIT_PHRASE), + ); + assert.ok(match); + assert.equal(match.matchedPhrase, "submit"); + assert.equal(match.textWithoutPhrase, "send this message"); +}); + +test("getAutoSubmitMatch_ignoresPhraseMidSentence", () => { + // "submit" is not at the end, so it must not auto-send. + assert.equal( + getAutoSubmitMatch( + "submit the form later", + parseAutoSubmitPhrases(DEFAULT_AUTO_SUBMIT_PHRASE), + ), + null, + ); +}); + +test("getAutoSubmitMatch_requiresWordBoundaryBeforePhrase", () => { + // "resubmit" ends with "submit" but is not a standalone word → no match. + assert.equal( + getAutoSubmitMatch("resubmit", parseAutoSubmitPhrases("submit")), + null, + ); +}); + +test("getAutoSubmitMatch_toleratesTrailingPunctuation", () => { + const match = getAutoSubmitMatch( + "ship it submit.", + parseAutoSubmitPhrases(DEFAULT_AUTO_SUBMIT_PHRASE), + ); + assert.ok(match); + assert.equal(match.textWithoutPhrase, "ship it"); +}); + +test("getAutoSubmitMatch_matchesMultiWordPhrase", () => { + const match = getAutoSubmitMatch( + "please do this send it", + parseAutoSubmitPhrases("send it"), + ); + assert.ok(match); + assert.equal(match.matchedPhrase, "send it"); + assert.equal(match.textWithoutPhrase, "please do this"); +}); + +test("getAutoSubmitMatch_prefersLongestPhrase", () => { + const match = getAutoSubmitMatch( + "text please submit now", + parseAutoSubmitPhrases("submit now, now"), + ); + assert.ok(match); + assert.equal(match.matchedPhrase, "submit now"); + assert.equal(match.textWithoutPhrase, "text please"); +}); diff --git a/desktop/src/features/messages/ui/MessageComposer.tsx b/desktop/src/features/messages/ui/MessageComposer.tsx index e7c441600..eed89f228 100644 --- a/desktop/src/features/messages/ui/MessageComposer.tsx +++ b/desktop/src/features/messages/ui/MessageComposer.tsx @@ -214,15 +214,15 @@ function MessageComposerImpl({ emojiAutocomplete.isEmojiAutocompleteOpen; const submitMessageRef = React.useRef<() => void>(() => {}); - + const setEditorContentRef = React.useRef<(t: string) => void>(() => {}); const dictation = useComposerDictation({ contentRef, disabled, isSending, - setComposerContentFromText, + setComposerContent, + setEditorContentRef, submitMessageRef, }); - const composerScrollRef = React.useRef(null); // Set after `useLinkEditor` exists below; the editor's link-click handler @@ -281,6 +281,7 @@ function MessageComposerImpl({ }, }); + setEditorContentRef.current = richText.setContent; const linkEditor = useLinkEditor(richText); syncContentRefFromEditorRef.current = () => { const markdown = richText.getMarkdown(); From 17830470b8ed5be367601f2df1229222d518e11c Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sat, 4 Jul 2026 16:47:38 +0100 Subject: [PATCH 06/12] fix(dictation): use client_secrets API, track transcripts by item_id - Switch relay from /v1/realtime/sessions to /v1/realtime/client_secrets with the wrapped { session: { ... } } request shape per OpenAI's current WebRTC guide. The old endpoint returns non-2xx, breaking dictation. - Redesign TranscriptSegmentState to track per-item segments keyed by item_id. Completed events for different turns can arrive out of order; reconciling by item_id preserves utterance ordering and prevents text reordering or partial-turn drops during fast consecutive speech. Signed-off-by: klopez4212 --- crates/buzz-relay/src/api/transcribe.rs | 21 ++-- .../dictation/hooks/useRealtimeDictation.ts | 4 +- .../dictation/lib/realtimeAudio.test.mjs | 112 ++++++++++++++---- .../features/dictation/lib/realtimeAudio.ts | 86 ++++++++++---- 4 files changed, 170 insertions(+), 53 deletions(-) diff --git a/crates/buzz-relay/src/api/transcribe.rs b/crates/buzz-relay/src/api/transcribe.rs index 9210938cc..ff8af8a99 100644 --- a/crates/buzz-relay/src/api/transcribe.rs +++ b/crates/buzz-relay/src/api/transcribe.rs @@ -20,7 +20,8 @@ use crate::state::AppState; use super::api_error; -const OPENAI_REALTIME_SESSIONS_URL: &str = "https://api.openai.com/v1/realtime/sessions"; +const OPENAI_REALTIME_CLIENT_SECRETS_URL: &str = + "https://api.openai.com/v1/realtime/client_secrets"; const DEFAULT_TRANSCRIPTION_MODEL: &str = "whisper-1"; /// Response for `GET /transcribe/status`. @@ -76,17 +77,19 @@ pub async fn create_transcribe_session( let client = reqwest::Client::new(); let response = client - .post(OPENAI_REALTIME_SESSIONS_URL) + .post(OPENAI_REALTIME_CLIENT_SECRETS_URL) .header("Authorization", format!("Bearer {api_key}")) .header("Content-Type", "application/json") .json(&serde_json::json!({ - "model": "gpt-4o-mini-realtime-preview", - "modalities": ["text"], - "input_audio_transcription": { - "model": model, - }, - "turn_detection": { - "type": "server_vad", + "session": { + "model": "gpt-4o-mini-realtime-preview", + "modalities": ["text"], + "input_audio_transcription": { + "model": model, + }, + "turn_detection": { + "type": "server_vad", + } } })) .timeout(std::time::Duration::from_secs(10)) diff --git a/desktop/src/features/dictation/hooks/useRealtimeDictation.ts b/desktop/src/features/dictation/hooks/useRealtimeDictation.ts index ce289e279..a65048d06 100644 --- a/desktop/src/features/dictation/hooks/useRealtimeDictation.ts +++ b/desktop/src/features/dictation/hooks/useRealtimeDictation.ts @@ -15,6 +15,7 @@ import { createPeerConnection, createTranscriptSegmentState, flushAudioBuffer, + getTranscriptText, mergeTranscriptEvent, } from "../lib/realtimeAudio"; @@ -116,8 +117,7 @@ export function useRealtimeDictation({ return; } - const prevText = - segmentStateRef.current.committed + segmentStateRef.current.pendingDelta; + const prevText = getTranscriptText(segmentStateRef.current); const merged = mergeTranscriptEvent(segmentStateRef.current, event); if (merged === prevText) return; diff --git a/desktop/src/features/dictation/lib/realtimeAudio.test.mjs b/desktop/src/features/dictation/lib/realtimeAudio.test.mjs index e2fe52cb5..abfb26ecd 100644 --- a/desktop/src/features/dictation/lib/realtimeAudio.test.mjs +++ b/desktop/src/features/dictation/lib/realtimeAudio.test.mjs @@ -1,11 +1,6 @@ import { describe, it } from "node:test"; import assert from "node:assert/strict"; -// We test the pure logic via dynamic import of the TS source compiled by the -// test runner (vitest/tsx). Since this is an .mjs file run by the Node test -// runner through the desktop vitest config, import the built output or use -// a direct TS import if the runner supports it. - // Inline the logic to keep the test self-contained and avoid bundler issues. const TRANSCRIPT_DELTA_EVENT = "conversation.item.input_audio_transcription.delta"; @@ -13,36 +8,55 @@ const TRANSCRIPT_COMPLETED_EVENT = "conversation.item.input_audio_transcription.completed"; function createTranscriptSegmentState() { - return { committed: "", pendingDelta: "" }; + return { itemOrder: [], items: new Map() }; +} + +function getOrCreateItem(state, itemId) { + let seg = state.items.get(itemId); + if (!seg) { + seg = { pending: "", finalized: null }; + state.items.set(itemId, seg); + state.itemOrder.push(itemId); + } + return seg; } function mergeTranscriptEvent(state, event) { + const itemId = event.item_id ?? "__default__"; + if (event.type === TRANSCRIPT_DELTA_EVENT) { + const seg = getOrCreateItem(state, itemId); const delta = event.delta ?? ""; if (delta) { - state.pendingDelta += delta; + seg.pending += delta; } } else if (event.type === TRANSCRIPT_COMPLETED_EVENT) { - const finalText = event.transcript ?? ""; - const separator = state.committed && finalText ? "" : ""; - state.committed = state.committed + separator + finalText; - state.pendingDelta = ""; + const seg = getOrCreateItem(state, itemId); + seg.finalized = event.transcript ?? ""; } - return state.committed + state.pendingDelta; + let result = ""; + for (const id of state.itemOrder) { + const seg = state.items.get(id); + if (!seg) continue; + result += seg.finalized ?? seg.pending; + } + return result; } describe("mergeTranscriptEvent", () => { - it("accumulates delta events", () => { + it("accumulates delta events for a single item", () => { const state = createTranscriptSegmentState(); const r1 = mergeTranscriptEvent(state, { type: TRANSCRIPT_DELTA_EVENT, + item_id: "item_1", delta: "hello ", }); assert.equal(r1, "hello "); const r2 = mergeTranscriptEvent(state, { type: TRANSCRIPT_DELTA_EVENT, + item_id: "item_1", delta: "world", }); assert.equal(r2, "hello world"); @@ -52,59 +66,115 @@ describe("mergeTranscriptEvent", () => { const state = createTranscriptSegmentState(); mergeTranscriptEvent(state, { type: TRANSCRIPT_DELTA_EVENT, + item_id: "item_1", delta: "hello world", }); - // Completed event carries corrected/punctuated version const result = mergeTranscriptEvent(state, { type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_1", transcript: "Hello, world.", }); assert.equal(result, "Hello, world."); - assert.equal(state.committed, "Hello, world."); - assert.equal(state.pendingDelta, ""); }); - it("handles multiple segments sequentially", () => { + it("handles multiple items in order", () => { const state = createTranscriptSegmentState(); - // First segment + // First item mergeTranscriptEvent(state, { type: TRANSCRIPT_DELTA_EVENT, + item_id: "item_1", delta: "first ", }); mergeTranscriptEvent(state, { type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_1", transcript: "First. ", }); - // Second segment + // Second item mergeTranscriptEvent(state, { type: TRANSCRIPT_DELTA_EVENT, + item_id: "item_2", delta: "second", }); - assert.equal(state.committed + state.pendingDelta, "First. second"); + assert.equal(state.items.get("item_2").pending, "second"); const result = mergeTranscriptEvent(state, { type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_2", transcript: "Second.", }); assert.equal(result, "First. Second."); }); + it("handles out-of-order completed events by item id", () => { + const state = createTranscriptSegmentState(); + + // Both items start with deltas + mergeTranscriptEvent(state, { + type: TRANSCRIPT_DELTA_EVENT, + item_id: "item_1", + delta: "first", + }); + mergeTranscriptEvent(state, { + type: TRANSCRIPT_DELTA_EVENT, + item_id: "item_2", + delta: "second", + }); + + // item_2 completes before item_1 + mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_2", + transcript: "Second. ", + }); + + // item_1 still shows pending + let result = mergeTranscriptEvent(state, { + type: TRANSCRIPT_DELTA_EVENT, + item_id: "item_1", + delta: " more", + }); + assert.equal(result, "first moreSecond. "); + + // item_1 finally completes + result = mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_1", + transcript: "First more. ", + }); + assert.equal(result, "First more. Second. "); + }); + it("does not duplicate text on completed event", () => { const state = createTranscriptSegmentState(); mergeTranscriptEvent(state, { type: TRANSCRIPT_DELTA_EVENT, + item_id: "item_1", delta: "hello world", }); - // Without the fix, this would append: "hello worldHello, world." const result = mergeTranscriptEvent(state, { type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_1", transcript: "Hello, world.", }); assert.equal(result, "Hello, world."); }); + + it("falls back to __default__ when item_id is missing", () => { + const state = createTranscriptSegmentState(); + mergeTranscriptEvent(state, { + type: TRANSCRIPT_DELTA_EVENT, + delta: "no id", + }); + const result = mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + transcript: "No id.", + }); + assert.equal(result, "No id."); + }); }); diff --git a/desktop/src/features/dictation/lib/realtimeAudio.ts b/desktop/src/features/dictation/lib/realtimeAudio.ts index bed28a727..4aaaeb148 100644 --- a/desktop/src/features/dictation/lib/realtimeAudio.ts +++ b/desktop/src/features/dictation/lib/realtimeAudio.ts @@ -56,49 +56,93 @@ export async function connectPeerConnection(options: { } /** - * State for tracking the current transcription segment. Completed events - * carry the final text for the same segment that prior deltas built up, - * potentially with corrections/punctuation. We track the delta accumulation - * so we can replace it with the finalized text on completion. + * Per-item tracking for a single transcription turn. OpenAI's Realtime API + * sends delta and completed events tagged with an `item_id`; completed events + * for different turns can arrive out of order, so we reconcile by item id. + */ +interface ItemSegment { + /** Accumulated delta text (replaced by finalized on completion). */ + pending: string; + /** Finalized text (set once the completed event arrives). */ + finalized: string | null; +} + +/** + * State for tracking transcription segments keyed by item id. + * Maintains insertion order so the full transcript is reconstructed + * in the order items were first seen. */ export interface TranscriptSegmentState { - /** Text committed from previous (completed) segments. */ - committed: string; - /** Accumulated delta text for the in-progress segment. */ - pendingDelta: string; + /** Ordered item ids (insertion order = utterance order). */ + itemOrder: string[]; + /** Per-item segment data. */ + items: Map; } export function createTranscriptSegmentState(): TranscriptSegmentState { - return { committed: "", pendingDelta: "" }; + return { itemOrder: [], items: new Map() }; +} + +/** Get the current full transcript text from segment state. */ +export function getTranscriptText(state: TranscriptSegmentState): string { + let result = ""; + for (const id of state.itemOrder) { + const seg = state.items.get(id); + if (!seg) continue; + result += seg.finalized ?? seg.pending; + } + return result; +} + +/** Internal: get or create the segment for an item. */ +function getOrCreateItem( + state: TranscriptSegmentState, + itemId: string, +): ItemSegment { + let seg = state.items.get(itemId); + if (!seg) { + seg = { pending: "", finalized: null }; + state.items.set(itemId, seg); + state.itemOrder.push(itemId); + } + return seg; } /** - * Merge a transcript event into the segment state. + * Merge a transcript event into the segment state, keyed by `item_id`. * - * - Delta events: append to `pendingDelta`. - * - Completed events: replace `pendingDelta` with the finalized transcript, - * then commit it (move to `committed` and reset `pendingDelta`). + * - Delta events: append to the item's `pending` text. + * - Completed events: store `finalized` text, replacing accumulated deltas. * - * Returns the full merged text (committed + pending). + * Returns the full merged text across all items in order. */ export function mergeTranscriptEvent( state: TranscriptSegmentState, event: TranscriptEvent, ): string { + // Use item_id from the event; fall back to a synthetic key for events + // that lack one (shouldn't happen in practice, but be defensive). + const itemId = event.item_id ?? "__default__"; + if (event.type === TRANSCRIPT_DELTA_EVENT) { + const seg = getOrCreateItem(state, itemId); const delta = event.delta ?? ""; if (delta) { - state.pendingDelta += delta; + seg.pending += delta; } } else if (event.type === TRANSCRIPT_COMPLETED_EVENT) { - const finalText = event.transcript ?? ""; - // Replace the accumulated deltas with the finalized text, then commit. - const separator = state.committed && finalText ? "" : ""; - state.committed = state.committed + separator + finalText; - state.pendingDelta = ""; + const seg = getOrCreateItem(state, itemId); + seg.finalized = event.transcript ?? ""; } - return state.committed + state.pendingDelta; + // Reconstruct full text from all items in order. + let result = ""; + for (const id of state.itemOrder) { + const seg = state.items.get(id); + if (!seg) continue; + result += seg.finalized ?? seg.pending; + } + return result; } // ── Audio buffer capture ────────────────────────────────────────────────── From 422b8556b5bf7e4eead2ab3abf0726b417b0213f Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sun, 5 Jul 2026 06:59:53 +0100 Subject: [PATCH 07/12] fix(dictation): typed transcription session, sync editor before merge, block sends during upload - Use OpenAI typed transcription session format (type: "transcription") instead of legacy realtime fields that would fail or produce no transcripts - Sync editor content via syncContentRef before merging dictation text so manually typed prefixes are preserved when dictation starts - Read send-blocked state from refs at transcript time so uploads prevent auto-submit from clearing the composer Signed-off-by: klopez4212 --- crates/buzz-relay/src/api/transcribe.rs | 3 +-- .../dictation/hooks/useComposerDictation.ts | 26 ++++++++++++------- .../features/dictation/hooks/useDictation.ts | 24 +++++++---------- .../features/messages/ui/MessageComposer.tsx | 10 +++---- 4 files changed, 33 insertions(+), 30 deletions(-) diff --git a/crates/buzz-relay/src/api/transcribe.rs b/crates/buzz-relay/src/api/transcribe.rs index ff8af8a99..f66c584e4 100644 --- a/crates/buzz-relay/src/api/transcribe.rs +++ b/crates/buzz-relay/src/api/transcribe.rs @@ -82,8 +82,7 @@ pub async fn create_transcribe_session( .header("Content-Type", "application/json") .json(&serde_json::json!({ "session": { - "model": "gpt-4o-mini-realtime-preview", - "modalities": ["text"], + "type": "transcription", "input_audio_transcription": { "model": model, }, diff --git a/desktop/src/features/dictation/hooks/useComposerDictation.ts b/desktop/src/features/dictation/hooks/useComposerDictation.ts index d907062be..808787059 100644 --- a/desktop/src/features/dictation/hooks/useComposerDictation.ts +++ b/desktop/src/features/dictation/hooks/useComposerDictation.ts @@ -1,10 +1,13 @@ import type * as React from "react"; +import { useRef } from "react"; import { useDictation } from "./useDictation"; interface UseComposerDictationOptions { - contentRef: React.MutableRefObject; - disabled: boolean; - isSending: boolean; + /** Ref to a function that syncs contentRef from the Tiptap editor and returns it. */ + syncContentRef: React.MutableRefObject<() => string>; + disabledRef: React.MutableRefObject; + isSendingRef: React.MutableRefObject; + isUploadingRef: React.MutableRefObject; /** Updates contentRef + isContentEmpty state. */ setComposerContent: (text: string) => void; /** Ref to a function that updates the Tiptap editor document. */ @@ -14,18 +17,23 @@ interface UseComposerDictationOptions { /** * Thin wrapper around `useDictation` pre-wired for the MessageComposer's - * state management (contentRef, setComposerContent, editor, submitMessageRef). + * state management (syncContentRef, setComposerContent, editor, submitMessageRef). */ export function useComposerDictation({ - contentRef, - disabled, - isSending, + syncContentRef, + disabledRef, + isSendingRef, + isUploadingRef, setComposerContent, setEditorContentRef, submitMessageRef, }: UseComposerDictationOptions) { + const isSendBlockedRef = useRef(false); + isSendBlockedRef.current = + disabledRef.current || isSendingRef.current || isUploadingRef.current; + return useDictation({ - text: contentRef.current, + getText: () => syncContentRef.current(), setText: (text) => { setComposerContent(text); setEditorContentRef.current(text); @@ -38,6 +46,6 @@ export function useComposerDictation({ // holds the dictated text. submitMessageRef.current(); }, - sendDisabled: disabled || isSending, + isSendBlockedRef, }); } diff --git a/desktop/src/features/dictation/hooks/useDictation.ts b/desktop/src/features/dictation/hooks/useDictation.ts index abf687704..ad42ef569 100644 --- a/desktop/src/features/dictation/hooks/useDictation.ts +++ b/desktop/src/features/dictation/hooks/useDictation.ts @@ -1,3 +1,4 @@ +import type * as React from "react"; import { useCallback, useMemo, useRef } from "react"; import { DEFAULT_AUTO_SUBMIT_PHRASE, @@ -8,35 +9,33 @@ import { import { useRealtimeDictation } from "./useRealtimeDictation"; interface UseDictationOptions { - /** Current composer text */ - text: string; + /** Returns the current composer text (must be fresh — synced from editor). */ + getText: () => string; /** Set composer text */ setText: (value: string) => void; /** Send the message */ onSend: (text: string) => void; - /** Whether sending is currently blocked */ - sendDisabled?: boolean; + /** Ref that is `true` when sending is blocked (uploading, preparing mention, etc.) */ + isSendBlockedRef?: React.MutableRefObject; } export function useDictation({ - text, + getText, setText, onSend, - sendDisabled = false, + isSendBlockedRef, }: UseDictationOptions) { const autoSubmitPhrases = useMemo( () => parseAutoSubmitPhrases(DEFAULT_AUTO_SUBMIT_PHRASE), [], ); const stopRecordingRef = useRef<() => void>(() => {}); - const textRef = useRef(text); - textRef.current = text; const lastTranscriptRef = useRef(""); const handleTranscript = useCallback( (transcript: string) => { const previous = lastTranscriptRef.current; - const latest = textRef.current; + const latest = getText(); const merged = replaceTrailingTranscribedText( latest, previous, @@ -46,7 +45,6 @@ export function useDictation({ if (!match) { setText(merged); - textRef.current = merged; lastTranscriptRef.current = transcript; return; } @@ -60,18 +58,16 @@ export function useDictation({ stopRecordingRef.current(); - if (sendDisabled) { + if (isSendBlockedRef?.current) { setText(textWithoutPhrase); - textRef.current = textWithoutPhrase; return; } onSend(textWithoutPhrase.trim()); setText(""); - textRef.current = ""; lastTranscriptRef.current = ""; }, - [autoSubmitPhrases, onSend, sendDisabled, setText], + [autoSubmitPhrases, getText, onSend, isSendBlockedRef, setText], ); const dictation = useRealtimeDictation({ diff --git a/desktop/src/features/messages/ui/MessageComposer.tsx b/desktop/src/features/messages/ui/MessageComposer.tsx index eed89f228..e7c17165c 100644 --- a/desktop/src/features/messages/ui/MessageComposer.tsx +++ b/desktop/src/features/messages/ui/MessageComposer.tsx @@ -214,17 +214,17 @@ function MessageComposerImpl({ emojiAutocomplete.isEmojiAutocompleteOpen; const submitMessageRef = React.useRef<() => void>(() => {}); - const setEditorContentRef = React.useRef<(t: string) => void>(() => {}); + const setEditorContentRef = React.useRef<(text: string) => void>(() => {}); const dictation = useComposerDictation({ - contentRef, - disabled, - isSending, + syncContentRef: syncContentRefFromEditorRef, + disabledRef, + isSendingRef, + isUploadingRef, setComposerContent, setEditorContentRef, submitMessageRef, }); const composerScrollRef = React.useRef(null); - // Set after `useLinkEditor` exists below; the editor's link-click handler // delegates through this ref to break the hook ordering cycle (the editor // needs `onEditLink`, but the link editor needs the editor's `richText`). From 7af4eeebe872bd2c0b38a7cc6381b28cdecd191e Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sun, 5 Jul 2026 07:22:42 +0100 Subject: [PATCH 08/12] fix(dictation): stop recording on channel/thread switch When the composer's draftKey changes (channel or thread switch), stop any active dictation session so transcript events from a stale WebRTC connection don't leak into the wrong draft. Signed-off-by: klopez4212 --- .../dictation/hooks/useComposerDictation.ts | 16 ++++++++++++++-- .../src/features/messages/ui/MessageComposer.tsx | 2 +- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/desktop/src/features/dictation/hooks/useComposerDictation.ts b/desktop/src/features/dictation/hooks/useComposerDictation.ts index 808787059..b945e68a3 100644 --- a/desktop/src/features/dictation/hooks/useComposerDictation.ts +++ b/desktop/src/features/dictation/hooks/useComposerDictation.ts @@ -1,5 +1,5 @@ import type * as React from "react"; -import { useRef } from "react"; +import { useEffect, useRef } from "react"; import { useDictation } from "./useDictation"; interface UseComposerDictationOptions { @@ -13,6 +13,8 @@ interface UseComposerDictationOptions { /** Ref to a function that updates the Tiptap editor document. */ setEditorContentRef: React.MutableRefObject<(text: string) => void>; submitMessageRef: React.MutableRefObject<() => void>; + /** When this key changes (channel/thread switch), active dictation is stopped. */ + draftKey?: string | null; } /** @@ -27,12 +29,13 @@ export function useComposerDictation({ setComposerContent, setEditorContentRef, submitMessageRef, + draftKey, }: UseComposerDictationOptions) { const isSendBlockedRef = useRef(false); isSendBlockedRef.current = disabledRef.current || isSendingRef.current || isUploadingRef.current; - return useDictation({ + const dictation = useDictation({ getText: () => syncContentRef.current(), setText: (text) => { setComposerContent(text); @@ -48,4 +51,13 @@ export function useComposerDictation({ }, isSendBlockedRef, }); + + // Stop dictation when the channel/thread changes so that transcript events + // from a stale WebRTC session don't leak into the wrong draft. + // biome-ignore lint/correctness/useExhaustiveDependencies: draftKey is the sole trigger + useEffect(() => { + dictation.stopRecording(); + }, [draftKey]); + + return dictation; } diff --git a/desktop/src/features/messages/ui/MessageComposer.tsx b/desktop/src/features/messages/ui/MessageComposer.tsx index e7c17165c..4c3d49d8f 100644 --- a/desktop/src/features/messages/ui/MessageComposer.tsx +++ b/desktop/src/features/messages/ui/MessageComposer.tsx @@ -223,6 +223,7 @@ function MessageComposerImpl({ setComposerContent, setEditorContentRef, submitMessageRef, + draftKey: effectiveDraftKey, }); const composerScrollRef = React.useRef(null); // Set after `useLinkEditor` exists below; the editor's link-click handler @@ -995,5 +996,4 @@ function MessageComposerImpl({ ); } - export const MessageComposer = React.memo(MessageComposerImpl); From 0636ead16d759ea9baeef1ad5f56b5f912dda8da Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sun, 5 Jul 2026 08:12:56 +0100 Subject: [PATCH 09/12] =?UTF-8?q?fix:=20address=20review=20comments=20?= =?UTF-8?q?=E2=80=94=20nested=20audio=20schema,=20item=20separators,=20saf?= =?UTF-8?q?e=20auto-submit=20clear?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Relay: restructure OpenAI client-secrets payload to use the current typed transcription schema (audio.input.transcription) instead of the deprecated top-level input_audio_transcription field. - realtimeAudio: insert space separators between transcript items when neither the preceding nor following text has whitespace, preventing multi-utterance runs from merging into unreadable text. - useDictation: remove premature setText('') after auto-submit — the send flow handles clearing on success, so dictated text survives if a mention dialog opens or the send is blocked. Signed-off-by: klopez4212 --- crates/buzz-relay/src/api/transcribe.rs | 16 +++++++++------- .../src/features/dictation/hooks/useDictation.ts | 7 ++++++- .../src/features/dictation/lib/realtimeAudio.ts | 15 +++++++-------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/crates/buzz-relay/src/api/transcribe.rs b/crates/buzz-relay/src/api/transcribe.rs index f66c584e4..97263a87f 100644 --- a/crates/buzz-relay/src/api/transcribe.rs +++ b/crates/buzz-relay/src/api/transcribe.rs @@ -81,14 +81,16 @@ pub async fn create_transcribe_session( .header("Authorization", format!("Bearer {api_key}")) .header("Content-Type", "application/json") .json(&serde_json::json!({ - "session": { - "type": "transcription", - "input_audio_transcription": { - "model": model, - }, - "turn_detection": { - "type": "server_vad", + "type": "transcription", + "audio": { + "input": { + "transcription": { + "model": model, + } } + }, + "turn_detection": { + "type": "server_vad", } })) .timeout(std::time::Duration::from_secs(10)) diff --git a/desktop/src/features/dictation/hooks/useDictation.ts b/desktop/src/features/dictation/hooks/useDictation.ts index ad42ef569..5e21fed16 100644 --- a/desktop/src/features/dictation/hooks/useDictation.ts +++ b/desktop/src/features/dictation/hooks/useDictation.ts @@ -63,8 +63,13 @@ export function useDictation({ return; } + // Set the text first so the composer shows the final dictated content, + // then trigger send. We intentionally do NOT clear the composer here — + // the send flow in MessageComposer handles clearing on successful send. + // If a mention dialog opens (non-member mention), the text stays in the + // composer so the user doesn't lose their dictated message. + setText(textWithoutPhrase.trim()); onSend(textWithoutPhrase.trim()); - setText(""); lastTranscriptRef.current = ""; }, [autoSubmitPhrases, getText, onSend, isSendBlockedRef, setText], diff --git a/desktop/src/features/dictation/lib/realtimeAudio.ts b/desktop/src/features/dictation/lib/realtimeAudio.ts index 4aaaeb148..2ad6e010d 100644 --- a/desktop/src/features/dictation/lib/realtimeAudio.ts +++ b/desktop/src/features/dictation/lib/realtimeAudio.ts @@ -89,7 +89,12 @@ export function getTranscriptText(state: TranscriptSegmentState): string { for (const id of state.itemOrder) { const seg = state.items.get(id); if (!seg) continue; - result += seg.finalized ?? seg.pending; + const text = seg.finalized ?? seg.pending; + if (!text) continue; + if (result && !result.endsWith(" ") && !text.startsWith(" ")) { + result += " "; + } + result += text; } return result; } @@ -136,13 +141,7 @@ export function mergeTranscriptEvent( } // Reconstruct full text from all items in order. - let result = ""; - for (const id of state.itemOrder) { - const seg = state.items.get(id); - if (!seg) continue; - result += seg.finalized ?? seg.pending; - } - return result; + return getTranscriptText(state); } // ── Audio buffer capture ────────────────────────────────────────────────── From 7d498da598147240a9dfb86ff1b92ced29cefd1a Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sun, 5 Jul 2026 09:08:16 +0100 Subject: [PATCH 10/12] fix(e2e): use keyboard.type for Tiptap edit test reliability Playwright's fill() on a contenteditable doesn't reliably update Tiptap's internal ProseMirror document model, causing getMarkdown() to return stale content when Enter fires immediately after. Replace clear()+fill() with select-all + keyboard.type() which triggers proper input events that Tiptap's transaction pipeline processes synchronously. Fixes the consistently flaky 'owner can edit their owned agent's message' test (also broken on main). Signed-off-by: klopez4212 --- desktop/tests/e2e/human-edit-agent-content.spec.ts | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/desktop/tests/e2e/human-edit-agent-content.spec.ts b/desktop/tests/e2e/human-edit-agent-content.spec.ts index 93bc13621..0994fce23 100644 --- a/desktop/tests/e2e/human-edit-agent-content.spec.ts +++ b/desktop/tests/e2e/human-edit-agent-content.spec.ts @@ -99,11 +99,15 @@ test("owner can edit their owned agent's message", async ({ page }) => { // Edit banner must appear confirming edit mode is active. await expect(page.getByTestId("edit-target")).toBeVisible({ timeout: 5_000 }); - // Clear the composer and type the new content, then submit. + // Wait for the editor to be populated with the original message content + // (edit mode calls richText.setContent which is async in Tiptap's + // transaction pipeline). Then select-all and type the replacement. const input = page.getByTestId("message-input"); - await input.clear(); - await input.fill(editedContent); - await input.press("Enter"); + await expect(input).not.toBeEmpty({ timeout: 5_000 }); + await input.click(); + await page.keyboard.press("ControlOrMeta+A"); + await page.keyboard.type(editedContent); + await page.keyboard.press("Enter"); // Edit mode must exit (banner gone) and the updated content must render. await expect(page.getByTestId("edit-target")).toBeHidden(); From 6d4f2ebbe23908e94e306ffd1acd0b0da306025d Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sun, 5 Jul 2026 09:20:14 +0100 Subject: [PATCH 11/12] fix(relay): wrap transcription config in session object The OpenAI client_secrets endpoint expects the body as { session: { type, audio: { input: { transcription, turn_detection } } } } not as top-level fields. Also moves turn_detection under audio.input per the Realtime transcription guide. Signed-off-by: klopez4212 --- crates/buzz-relay/src/api/transcribe.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/crates/buzz-relay/src/api/transcribe.rs b/crates/buzz-relay/src/api/transcribe.rs index 97263a87f..1e83be13e 100644 --- a/crates/buzz-relay/src/api/transcribe.rs +++ b/crates/buzz-relay/src/api/transcribe.rs @@ -81,16 +81,18 @@ pub async fn create_transcribe_session( .header("Authorization", format!("Bearer {api_key}")) .header("Content-Type", "application/json") .json(&serde_json::json!({ - "type": "transcription", - "audio": { - "input": { - "transcription": { - "model": model, + "session": { + "type": "transcription", + "audio": { + "input": { + "transcription": { + "model": model, + }, + "turn_detection": { + "type": "server_vad", + } } } - }, - "turn_detection": { - "type": "server_vad", } })) .timeout(std::time::Duration::from_secs(10)) From 5ce09acd753b12adb99e088464c11c40df9aba4b Mon Sep 17 00:00:00 2001 From: klopez4212 Date: Sun, 5 Jul 2026 09:33:05 +0100 Subject: [PATCH 12/12] fix(dictation): preserve transcript item order from committed events Handle input_audio_buffer.committed events to register items in the correct utterance order using previous_item_id before any transcript events arrive. This ensures that when completions for different turns arrive out of order (or when only completions are sent without deltas), the composer reconstructs multi-utterance dictation in the correct sequence rather than event-arrival order. Added tests for committed-order preservation, out-of-order completions with pre-registered order, and completion-only flows. Signed-off-by: klopez4212 --- .../dictation/hooks/useRealtimeDictation.ts | 4 +- .../dictation/lib/realtimeAudio.test.mjs | 158 ++++++++++++++++-- .../features/dictation/lib/realtimeAudio.ts | 29 +++- 3 files changed, 172 insertions(+), 19 deletions(-) diff --git a/desktop/src/features/dictation/hooks/useRealtimeDictation.ts b/desktop/src/features/dictation/hooks/useRealtimeDictation.ts index a65048d06..a607f1515 100644 --- a/desktop/src/features/dictation/hooks/useRealtimeDictation.ts +++ b/desktop/src/features/dictation/hooks/useRealtimeDictation.ts @@ -8,6 +8,7 @@ import { type AudioBufferCapture, type TranscriptEvent, type TranscriptSegmentState, + BUFFER_COMMITTED_EVENT, TRANSCRIPT_COMPLETED_EVENT, TRANSCRIPT_DELTA_EVENT, connectPeerConnection, @@ -112,7 +113,8 @@ export function useRealtimeDictation({ if ( event.type !== TRANSCRIPT_DELTA_EVENT && - event.type !== TRANSCRIPT_COMPLETED_EVENT + event.type !== TRANSCRIPT_COMPLETED_EVENT && + event.type !== BUFFER_COMMITTED_EVENT ) { return; } diff --git a/desktop/src/features/dictation/lib/realtimeAudio.test.mjs b/desktop/src/features/dictation/lib/realtimeAudio.test.mjs index abfb26ecd..adf0ca663 100644 --- a/desktop/src/features/dictation/lib/realtimeAudio.test.mjs +++ b/desktop/src/features/dictation/lib/realtimeAudio.test.mjs @@ -6,25 +6,52 @@ const TRANSCRIPT_DELTA_EVENT = "conversation.item.input_audio_transcription.delta"; const TRANSCRIPT_COMPLETED_EVENT = "conversation.item.input_audio_transcription.completed"; +const BUFFER_COMMITTED_EVENT = "input_audio_buffer.committed"; function createTranscriptSegmentState() { return { itemOrder: [], items: new Map() }; } -function getOrCreateItem(state, itemId) { +function getOrCreateItem(state, itemId, previousItemId) { let seg = state.items.get(itemId); if (!seg) { seg = { pending: "", finalized: null }; state.items.set(itemId, seg); - state.itemOrder.push(itemId); + if (previousItemId) { + const prevIndex = state.itemOrder.indexOf(previousItemId); + if (prevIndex !== -1) { + state.itemOrder.splice(prevIndex + 1, 0, itemId); + } else { + state.itemOrder.push(itemId); + } + } else { + state.itemOrder.push(itemId); + } } return seg; } +function getTranscriptText(state) { + let result = ""; + for (const id of state.itemOrder) { + const seg = state.items.get(id); + if (!seg) continue; + const text = seg.finalized ?? seg.pending; + if (!text) continue; + if (result && !result.endsWith(" ") && !text.startsWith(" ")) { + result += " "; + } + result += text; + } + return result; +} + function mergeTranscriptEvent(state, event) { const itemId = event.item_id ?? "__default__"; - if (event.type === TRANSCRIPT_DELTA_EVENT) { + if (event.type === BUFFER_COMMITTED_EVENT) { + getOrCreateItem(state, itemId, event.previous_item_id ?? undefined); + } else if (event.type === TRANSCRIPT_DELTA_EVENT) { const seg = getOrCreateItem(state, itemId); const delta = event.delta ?? ""; if (delta) { @@ -35,13 +62,7 @@ function mergeTranscriptEvent(state, event) { seg.finalized = event.transcript ?? ""; } - let result = ""; - for (const id of state.itemOrder) { - const seg = state.items.get(id); - if (!seg) continue; - result += seg.finalized ?? seg.pending; - } - return result; + return getTranscriptText(state); } describe("mergeTranscriptEvent", () => { @@ -90,7 +111,7 @@ describe("mergeTranscriptEvent", () => { mergeTranscriptEvent(state, { type: TRANSCRIPT_COMPLETED_EVENT, item_id: "item_1", - transcript: "First. ", + transcript: "First.", }); // Second item @@ -128,7 +149,7 @@ describe("mergeTranscriptEvent", () => { mergeTranscriptEvent(state, { type: TRANSCRIPT_COMPLETED_EVENT, item_id: "item_2", - transcript: "Second. ", + transcript: "Second.", }); // item_1 still shows pending @@ -137,15 +158,15 @@ describe("mergeTranscriptEvent", () => { item_id: "item_1", delta: " more", }); - assert.equal(result, "first moreSecond. "); + assert.equal(result, "first more Second."); // item_1 finally completes result = mergeTranscriptEvent(state, { type: TRANSCRIPT_COMPLETED_EVENT, item_id: "item_1", - transcript: "First more. ", + transcript: "First more.", }); - assert.equal(result, "First more. Second. "); + assert.equal(result, "First more. Second."); }); it("does not duplicate text on completed event", () => { @@ -177,4 +198,111 @@ describe("mergeTranscriptEvent", () => { }); assert.equal(result, "No id."); }); + + it("preserves committed item order when completions arrive out of order", () => { + const state = createTranscriptSegmentState(); + + // Server commits items in order: item_1 then item_2 + mergeTranscriptEvent(state, { + type: BUFFER_COMMITTED_EVENT, + item_id: "item_1", + previous_item_id: null, + }); + mergeTranscriptEvent(state, { + type: BUFFER_COMMITTED_EVENT, + item_id: "item_2", + previous_item_id: "item_1", + }); + + // But item_2's completion arrives first + mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_2", + transcript: "Second.", + }); + + // Then item_1's completion arrives + const result = mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_1", + transcript: "First.", + }); + + // Order must follow committed order, not arrival order + assert.equal(result, "First. Second."); + }); + + it("inserts item after previous_item_id even if later items already exist", () => { + const state = createTranscriptSegmentState(); + + // Commit item_1 + mergeTranscriptEvent(state, { + type: BUFFER_COMMITTED_EVENT, + item_id: "item_1", + }); + + // Commit item_3 (item_2 not yet committed) + mergeTranscriptEvent(state, { + type: BUFFER_COMMITTED_EVENT, + item_id: "item_3", + previous_item_id: "item_1", + }); + + // Now commit item_2 between item_1 and item_3 + // (previous_item_id = item_1, so it goes after item_1) + mergeTranscriptEvent(state, { + type: BUFFER_COMMITTED_EVENT, + item_id: "item_2", + previous_item_id: "item_1", + }); + + // Add transcripts + mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_1", + transcript: "One.", + }); + mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_2", + transcript: "Two.", + }); + const result = mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_3", + transcript: "Three.", + }); + + // item_2 was inserted after item_1 (before item_3) + assert.equal(result, "One. Two. Three."); + }); + + it("handles completion-only flow with committed order", () => { + const state = createTranscriptSegmentState(); + + // Server commits both items + mergeTranscriptEvent(state, { + type: BUFFER_COMMITTED_EVENT, + item_id: "item_A", + }); + mergeTranscriptEvent(state, { + type: BUFFER_COMMITTED_EVENT, + item_id: "item_B", + previous_item_id: "item_A", + }); + + // Only completed events arrive (no deltas) — in reverse order + mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_B", + transcript: "Bravo.", + }); + const result = mergeTranscriptEvent(state, { + type: TRANSCRIPT_COMPLETED_EVENT, + item_id: "item_A", + transcript: "Alpha.", + }); + + assert.equal(result, "Alpha. Bravo."); + }); }); diff --git a/desktop/src/features/dictation/lib/realtimeAudio.ts b/desktop/src/features/dictation/lib/realtimeAudio.ts index 2ad6e010d..afac2ad33 100644 --- a/desktop/src/features/dictation/lib/realtimeAudio.ts +++ b/desktop/src/features/dictation/lib/realtimeAudio.ts @@ -9,12 +9,14 @@ export const TRANSCRIPT_DELTA_EVENT = "conversation.item.input_audio_transcription.delta"; export const TRANSCRIPT_COMPLETED_EVENT = "conversation.item.input_audio_transcription.completed"; +export const BUFFER_COMMITTED_EVENT = "input_audio_buffer.committed"; const MAX_BUFFER_CHUNKS = 500; // ~10s at 20ms per chunk export type TranscriptEvent = { type?: string; item_id?: string; + previous_item_id?: string; content_index?: number; delta?: string; transcript?: string; @@ -99,16 +101,32 @@ export function getTranscriptText(state: TranscriptSegmentState): string { return result; } -/** Internal: get or create the segment for an item. */ +/** + * Internal: get or create the segment for an item. + * When `previousItemId` is provided (from committed events), the new item is + * inserted after that item in `itemOrder` to preserve the server's utterance + * order — even if transcript events for a later item arrive first. + */ function getOrCreateItem( state: TranscriptSegmentState, itemId: string, + previousItemId?: string, ): ItemSegment { let seg = state.items.get(itemId); if (!seg) { seg = { pending: "", finalized: null }; state.items.set(itemId, seg); - state.itemOrder.push(itemId); + if (previousItemId) { + const prevIndex = state.itemOrder.indexOf(previousItemId); + if (prevIndex !== -1) { + state.itemOrder.splice(prevIndex + 1, 0, itemId); + } else { + // Previous item not yet seen — append (best effort). + state.itemOrder.push(itemId); + } + } else { + state.itemOrder.push(itemId); + } } return seg; } @@ -116,6 +134,8 @@ function getOrCreateItem( /** * Merge a transcript event into the segment state, keyed by `item_id`. * + * - Committed events: register the item in the correct order using + * `previous_item_id` from the server, before any transcript arrives. * - Delta events: append to the item's `pending` text. * - Completed events: store `finalized` text, replacing accumulated deltas. * @@ -129,7 +149,10 @@ export function mergeTranscriptEvent( // that lack one (shouldn't happen in practice, but be defensive). const itemId = event.item_id ?? "__default__"; - if (event.type === TRANSCRIPT_DELTA_EVENT) { + if (event.type === BUFFER_COMMITTED_EVENT) { + // Register the item in the correct position before transcripts arrive. + getOrCreateItem(state, itemId, event.previous_item_id ?? undefined); + } else if (event.type === TRANSCRIPT_DELTA_EVENT) { const seg = getOrCreateItem(state, itemId); const delta = event.delta ?? ""; if (delta) {