From 9bed98fb9f338a0fa4c02d578bce14894a63ee33 Mon Sep 17 00:00:00 2001 From: James Walker Date: Wed, 18 Mar 2026 13:16:32 +0000 Subject: [PATCH 1/4] Point quickstart to new Python client --- .../realtime/assets/sm-rt-example.py | 60 ++++++ docs/speech-to-text/realtime/quickstart.mdx | 196 +++++++++++++----- 2 files changed, 204 insertions(+), 52 deletions(-) create mode 100644 docs/speech-to-text/realtime/assets/sm-rt-example.py diff --git a/docs/speech-to-text/realtime/assets/sm-rt-example.py b/docs/speech-to-text/realtime/assets/sm-rt-example.py new file mode 100644 index 00000000..e2defd3d --- /dev/null +++ b/docs/speech-to-text/realtime/assets/sm-rt-example.py @@ -0,0 +1,60 @@ +import asyncio +from speechmatics.rt import ( + AudioEncoding, AudioFormat, AuthenticationError, + Microphone, ServerMessageType, TranscriptResult, + TranscriptionConfig, AsyncClient, +) + +API_KEY = YOUR_API_KEY + +# Set up config and format for transcription +audio_format = AudioFormat( + encoding=AudioEncoding.PCM_S16LE, + sample_rate=16000, + chunk_size=4096, +) +config = TranscriptionConfig( + language="en", + max_delay=0.7, +) + +async def main(): + + # Set up microphone + mic = Microphone( + sample_rate=audio_format.sample_rate, + chunk_size=audio_format.chunk_size + ) + if not mic.start(): + print("Mic not started — please install PyAudio") + + try: + async with AsyncClient(api_key=API_KEY) as client: + # Handle ADD_TRANSCRIPT message + @client.on(ServerMessageType.ADD_TRANSCRIPT) + def handle_finals(msg): + if final := TranscriptResult.from_message(msg).metadata.transcript: + print(f"[Final]: {final}") + + try: + # Begin transcribing + await client.start_session( + transcription_config=config, + audio_format=audio_format + ) + while True: + await client.send_audio( + await mic.read( + chunk_size=audio_format.chunk_size + ) + ) + except KeyboardInterrupt: + pass + finally: + mic.stop() + + except AuthenticationError as e: + print(f"Auth error: {e}") + +if __name__ == "__main__": 
+ asyncio.run(main()) \ No newline at end of file diff --git a/docs/speech-to-text/realtime/quickstart.mdx b/docs/speech-to-text/realtime/quickstart.mdx index b131207a..55fc31ee 100644 --- a/docs/speech-to-text/realtime/quickstart.mdx +++ b/docs/speech-to-text/realtime/quickstart.mdx @@ -1,98 +1,190 @@ --- -description: Learn how to convert streaming audio to text. +pagination_prev: null +pagination_next: null +description: Learn how to transcribe streaming audio to text in real time. --- import Admonition from '@theme/Admonition'; import CodeBlock from '@theme/CodeBlock'; import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; +import { Grid } from '@radix-ui/themes'; +import { LinkCard } from "@site/src/theme/LinkCard"; +import { Users, BookMarked, Zap, Mic, Radio, Clock } from 'lucide-react'; import javascriptRadioExample from "./assets/javascript-radio-example.js?raw" -import pythonRadioExample from "./assets/url-example.py?raw" +import pythonRtExample from "./assets/sm-rt-example.py?raw" # Quickstart :::tip -The easiest way to try Realtime transcription is via the [web portal](https://portal.speechmatics.com/jobs/create/real-time). +The quickest way to try real-time transcription is via the [web portal](https://portal.speechmatics.com/jobs/create/real-time) — no code required. ::: -## Using the Realtime SaaS webSocket API +## Using the Realtime API + +The Realtime API streams audio over a WebSocket connection and returns transcript results as you speak. Unlike the [Batch API](/speech-to-text/batch/quickstart), results arrive continuously — within milliseconds of the spoken words. ### 1. Create an API key -[Create an API key in the portal here](https://portal.speechmatics.com/settings/api-keys), which you'll use to securely access the API. -Store the key as a managed secret. +[Create an API key in the portal](https://portal.speechmatics.com/settings/api-keys), which you'll use to securely access the API. Store the key as a managed secret. 
:::info Enterprise customers may need to speak to [Support](https://support.speechmatics.com) to get your API keys. ::: -### 2. Pick and install a library - -Check out our [JavaScript client](https://www.npmjs.com/package/@speechmatics/real-time-client) or [Python client](https://pypi.org/project/speechmatics-python/) to get started. +### 2. Install the library - - ``` - npm install @speechmatics/real-time-client @speechmatics/auth - ``` - - ``` - pip3 install speechmatics-python - ``` + Install using pip: + ``` + pip install speechmatics-rt pyaudio + ``` + :::note + `pyaudio` is required for microphone input in this quickstart. + ::: + + + Install using npm: + ``` + npm install @speechmatics/real-time-client @speechmatics/auth + ``` +### 3. Run the example -### 3. Insert your API key - -Paste your API key into `YOUR_API_KEY` in the code. +Replace `YOUR_API_KEY` with your key, then run the script. + + + {pythonRtExample} + + Speak into your microphone. You should see output like: + ``` + [Final]: Hello, welcome to Speechmatics. + [Final]: This is a real-time transcription example. + ``` + Press `Ctrl+C` to stop. + {javascriptRadioExample} - - - - {pythonRadioExample} - + This example transcribes a live radio stream. You should see a rolling transcript printed to the console. + Press `Ctrl+C` to stop. +## Understanding the output +The API returns two types of transcript results. You can use either or both depending on your use case. -## Transcript outputs +| Type | Latency | Stability | Best for | +|------|---------|-----------|----------| +| **Final** | ~0.7–2s | Definitive, never revised | Accurate transcripts, subtitles | +| **Partial** | <500ms | May be revised | Live captions, voice interfaces | -The API returns transcripts in JSON format. You can receive two types of output: [Final](#final-transcripts) and [Partial](#partial-transcripts) transcripts. Choose the type based on your latency and accuracy needs. 
+**Finals** represent the best transcription for a span of audio and are never updated once emitted. You can tune their latency using [`max_delay`](/speech-to-text/realtime/output#latency) — lower values reduce delay at the cost of slight accuracy. -### Final transcripts +**Partials** are emitted immediately as audio arrives and may be revised as more context is processed. A common pattern is to display partials immediately, then replace them with finals as they arrive. -Final transcripts are the definitive result. -- They reflect the best transcription for the spoken audio. -- Once displayed, they are not updated. -- Words arrive incrementally, with some delay. +To receive partials, set `enable_partials=True` in your `TranscriptionConfig` and register a handler for `ADD_PARTIAL_TRANSCRIPT`: -You control the latency and accuracy tradeoff [using the `max_delay` setting](/speech-to-text/realtime/output#latency) in your `transcription_config`. -Larger values of `max_delay` increase accuracy by giving the system more time to process audio context. - -:::tip -Best for accurate, completed transcripts where some delay is acceptable -::: - -### Partial transcripts - -Partial transcripts are low-latency and can update later as more conversation context arrives. -- You must enable them using `enable_partials` in your `transcription_config`. -- Partials are emitted quickly (typically less than 500ms). -- The engine may revise them as more audio is processed. - -You can combine partials with finals for a responsive user experience — show partials first, then replace them with finals as they arrive. - -You control the latency and accuracy tradeoff using the [`max_delay` setting](/speech-to-text/realtime/output#latency) in your `transcription_config`. 
+ + + ```python + config = TranscriptionConfig( + language="en", + max_delay=0.7, + enable_partials=True, # Enable partial transcripts + ) + + async with AsyncClient(api_key=API_KEY) as client: + @client.on(ServerMessageType.ADD_PARTIAL_TRANSCRIPT) + def handle_partials(msg): + if partial := TranscriptResult.from_message(msg).metadata.transcript: + print(f"[Partial]: {partial}") + + @client.on(ServerMessageType.ADD_TRANSCRIPT) + def handle_finals(msg): + if final := TranscriptResult.from_message(msg).metadata.transcript: + print(f"[Final]: {final}") + ``` + With both handlers registered, you'll see partials arrive first, then be superseded by the final result: + ``` + [Partial]: Hello wel + [Partial]: Hello welcome to + [Final]: Hello, welcome to Speechmatics. + ``` + + + ```javascript + await client.start(jwt, { + transcription_config: { + language: "en", + enable_partials: true, // Enable partial transcripts + }, + }); + + client.addEventListener("receiveMessage", ({ data }) => { + if (data.message === "AddPartialTranscript") { + process.stdout.write(`[Partial]: ${data.metadata.transcript}\r`); + } else if (data.message === "AddTranscript") { + console.log(`[Final]: ${data.metadata.transcript}`); + } + }); + ``` + With both handlers registered, you'll see partials arrive first, then be superseded by the final result: + ``` + [Partial]: Hello wel + [Partial]: Hello welcome to + [Final]: Hello, welcome to Speechmatics. + ``` + + -:::tip -Use partials for: real-time captions, voice interfaces, or any case where speed matters -::: +## Next steps + +Now that you have real-time transcription working, explore these features to build more powerful applications. 
+ + + } + /> + } + /> + } + /> + } + /> + } + /> + } + /> + From d0f0a04b210c9d1fa85a19c122ff772316de30c4 Mon Sep 17 00:00:00 2001 From: James Walker Date: Thu, 26 Mar 2026 10:30:37 +0000 Subject: [PATCH 2/4] address rt spelling changes --- docs/speech-to-text/realtime/quickstart.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/speech-to-text/realtime/quickstart.mdx b/docs/speech-to-text/realtime/quickstart.mdx index 55fc31ee..6cc361a9 100644 --- a/docs/speech-to-text/realtime/quickstart.mdx +++ b/docs/speech-to-text/realtime/quickstart.mdx @@ -1,7 +1,7 @@ --- pagination_prev: null pagination_next: null -description: Learn how to transcribe streaming audio to text in real time. +description: Learn how to transcribe streaming audio to text in real-time. --- import Admonition from '@theme/Admonition'; @@ -18,7 +18,7 @@ import pythonRtExample from "./assets/sm-rt-example.py?raw" # Quickstart :::tip -The quickest way to try real-time transcription is via the [web portal](https://portal.speechmatics.com/jobs/create/real-time) — no code required. +The quickest way to try Realtime transcription is via the [web portal](https://portal.speechmatics.com/jobs/create/real-time) — no code required. ::: ## Using the Realtime API @@ -148,7 +148,7 @@ To receive partials, set `enable_partials=True` in your `TranscriptionConfig` an ## Next steps -Now that you have real-time transcription working, explore these features to build more powerful applications. +Now that you have Realtime transcription working, explore these features to build more powerful applications. 
Date: Thu, 26 Mar 2026 11:57:13 +0000 Subject: [PATCH 3/4] Match batch and rt steps --- docs/speech-to-text/batch/quickstart.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/speech-to-text/batch/quickstart.mdx b/docs/speech-to-text/batch/quickstart.mdx index 011c4ec8..f5563b53 100644 --- a/docs/speech-to-text/batch/quickstart.mdx +++ b/docs/speech-to-text/batch/quickstart.mdx @@ -62,9 +62,9 @@ Check out our [Batch Python client](https://github.com/speechmatics/speechmatics Download and save our [example.wav](https://github.com/speechmatics/speechmatics-js-sdk/raw/7d219bfee9166736e6aa21598535a194387b84be/examples/nodejs/example.wav) -### 4. Insert API key +### 4. Run the example -Paste your API key into `YOUR_API_KEY` in the code below. +Replace `YOUR_API_KEY` with your key, then run the script. From 2d3a4091f844609007f3c1ff06cab137ebf92746 Mon Sep 17 00:00:00 2001 From: James Walker Date: Thu, 26 Mar 2026 14:45:34 +0000 Subject: [PATCH 4/4] uplift rt JS docs --- .../assets/javascript-realtime-example.js | 58 +++++++++++++++++ docs/speech-to-text/realtime/quickstart.mdx | 65 +++++++++---------- 2 files changed, 90 insertions(+), 33 deletions(-) create mode 100644 docs/speech-to-text/realtime/assets/javascript-realtime-example.js diff --git a/docs/speech-to-text/realtime/assets/javascript-realtime-example.js b/docs/speech-to-text/realtime/assets/javascript-realtime-example.js new file mode 100644 index 00000000..9c7b6604 --- /dev/null +++ b/docs/speech-to-text/realtime/assets/javascript-realtime-example.js @@ -0,0 +1,58 @@ +import { spawn } from "node:child_process"; +import { createSpeechmaticsJWT } from "@speechmatics/auth"; +import { RealtimeClient } from "@speechmatics/real-time-client"; + +const apiKey = YOUR_API_KEY; +const client = new RealtimeClient(); + +const audio_format = { + type: "raw", + encoding: "pcm_s16le", + sample_rate: 44100, +}; + +async function transcribe() { + client.addEventListener("receiveMessage", ({ data }) 
=> { + if (data.message === "AddTranscript") { + const transcript = data.metadata?.transcript; + if (transcript) console.log(`[Final]: ${transcript}`); + } else if (data.message === "Error") { + console.error(`Error [${data.type}]: ${data.reason}`); + process.exit(1); + } + }); + + const jwt = await createSpeechmaticsJWT({ type: "rt", apiKey, ttl: 60 }); + + await client.start(jwt, { + transcription_config: { + language: "en", + max_delay: 0.7 + }, + audio_format, + }); + + const recorder = spawn("sox", [ + "-d", // default audio device (mic) + "-q", // quiet + "-r", String(audio_format.sample_rate), // sample rate + "-e", "signed-integer", // match pcm_s16le + "-b", "16", // match pcm_s16le + "-c", "1", // mono + "-t", "raw", // raw PCM output + "-", // pipe to stdout + ]); + + recorder.stdout.on("data", (chunk) => client.sendAudio(chunk)); + recorder.stderr.on("data", (d) => console.error(`sox: ${d}`)); + + process.on("SIGINT", () => { + recorder.kill(); + client.stopRecognition({ noTimeout: true }); + }); +} + +transcribe().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/docs/speech-to-text/realtime/quickstart.mdx b/docs/speech-to-text/realtime/quickstart.mdx index 6cc361a9..c9b16efa 100644 --- a/docs/speech-to-text/realtime/quickstart.mdx +++ b/docs/speech-to-text/realtime/quickstart.mdx @@ -12,7 +12,7 @@ import { Grid } from '@radix-ui/themes'; import { LinkCard } from "@site/src/theme/LinkCard"; import { Users, BookMarked, Zap, Mic, Radio, Clock } from 'lucide-react'; -import javascriptRadioExample from "./assets/javascript-radio-example.js?raw" +import javascriptRtExample from "./assets/javascript-realtime-example.js?raw" import pythonRtExample from "./assets/sm-rt-example.py?raw" # Quickstart @@ -50,6 +50,9 @@ Enterprise customers may need to speak to [Support](https://support.speechmatics ``` npm install @speechmatics/real-time-client @speechmatics/auth ``` + :::note + This quickstart uses `sox` for microphone input. 
Install it with `brew install sox` (macOS) or `apt install sox` (Linux). + ::: </TabItem> </Tabs> @@ -62,44 +65,45 @@ Replace `YOUR_API_KEY` with your key, then run the script. <TabItem value="python" label="Python"> {pythonRtExample} - Speak into your microphone. You should see output like: - ``` - [Final]: Hello, welcome to Speechmatics. - [Final]: This is a real-time transcription example. - ``` Press `Ctrl+C` to stop. </TabItem> <TabItem value="javascript" label="JavaScript"> - {javascriptRadioExample} + {javascriptRtExample} - This example transcribes a live radio stream. You should see a rolling transcript printed to the console. - Press `Ctrl+C` to stop. </TabItem> </Tabs> +Speak into your microphone. You should see output like: +``` +[Final]: Hello, welcome to Speechmatics. +[Final]: This is a real-time transcription example. +``` +Press `Ctrl+C` to stop. ## Understanding the output -The API returns two types of transcript results. You can use either or both depending on your use case. +The API returns two types of transcript results: **Finals** and **Partials**. + +**Finals** represent the best transcription for a span of audio and are never updated once emitted. + +**Partials** are emitted immediately as audio arrives and may be revised as more context is processed. | Type | Latency | Stability | Best for | |------|---------|-----------|----------| | **Final** | ~0.7–2s | Definitive, never revised | Accurate transcripts, subtitles | | **Partial** | <500ms | May be revised | Live captions, voice interfaces | -**Finals** represent the best transcription for a span of audio and are never updated once emitted. You can tune their latency using [`max_delay`](/speech-to-text/realtime/output#latency) — lower values reduce delay at the cost of slight accuracy. - -**Partials** are emitted immediately as audio arrives and may be revised as more context is processed. A common pattern is to display partials immediately, then replace them with finals as they arrive. 
+## Receiving Finals and Partials -To receive partials, set `enable_partials=True` in your `TranscriptionConfig` and register a handler for `ADD_PARTIAL_TRANSCRIPT`: +To receive partials, add the following changes and handlers to your code: <Tabs groupId="lang"> <TabItem value="python" label="Python"> - ```python + ```python {4,8-11} config = TranscriptionConfig( language="en", max_delay=0.7, - enable_partials=True, # Enable partial transcripts + enable_partials=True, ) async with AsyncClient(api_key=API_KEY) as client: @@ -113,38 +117,33 @@ To receive partials, set `enable_partials=True` in your `TranscriptionConfig` an if final := TranscriptResult.from_message(msg).metadata.transcript: print(f"[Final]: {final}") ``` - With both handlers registered, you'll see partials arrive first, then be superseded by the final result: - ``` - [Partial]: Hello wel - [Partial]: Hello welcome to - [Final]: Hello, welcome to Speechmatics. - ``` </TabItem> <TabItem value="javascript" label="JavaScript"> - ```javascript + ```javascript {5,12-13} await client.start(jwt, { transcription_config: { + max_delay: 0.7, language: "en", - enable_partials: true, // Enable partial transcripts + enable_partials: true, }, }); client.addEventListener("receiveMessage", ({ data }) => { - if (data.message === "AddPartialTranscript") { - process.stdout.write(`[Partial]: ${data.metadata.transcript}\r`); - } else if (data.message === "AddTranscript") { + if (data.message === "AddTranscript") { console.log(`[Final]: ${data.metadata.transcript}`); + } else if (data.message === "AddPartialTranscript") { + console.log(`[Partial]: ${data.metadata.transcript}`); } }); ``` - With both handlers registered, you'll see partials arrive first, then be superseded by the final result: - ``` - [Partial]: Hello wel - [Partial]: Hello welcome to - [Final]: Hello, welcome to Speechmatics. - ``` </TabItem> </Tabs> +With both handlers registered, you'll see partials arrive first, followed by the final result: +``` +[Partial]: Hello +[Partial]: Hello welcome to +[Final]: Hello, welcome to Speechmatics. +``` ## Next steps