From 72c26598a6a699635e264702d03c6f60f9a71019 Mon Sep 17 00:00:00 2001 From: gsharp-aai Date: Tue, 23 Dec 2025 15:28:50 -0800 Subject: [PATCH 1/3] Add all --- .../transcribe-streaming-audio.mdx | 14 --- .../universal-streaming/multilingual.mdx | 2 +- .../universal-streaming.mdx | 7 -- llm-gateway.yml | 92 ++++++++++++++++--- openapi.yml | 30 +++--- usm-streaming.yml | 7 +- 6 files changed, 100 insertions(+), 52 deletions(-) diff --git a/fern/pages/01-getting-started/transcribe-streaming-audio.mdx b/fern/pages/01-getting-started/transcribe-streaming-audio.mdx index 6e8e2bc7..b19b1ee2 100644 --- a/fern/pages/01-getting-started/transcribe-streaming-audio.mdx +++ b/fern/pages/01-getting-started/transcribe-streaming-audio.mdx @@ -59,13 +59,6 @@ def on_begin(self: Type[StreamingClient], event: BeginEvent): def on_turn(self: Type[StreamingClient], event: TurnEvent): print(f"{event.transcript} ({event.end_of_turn})") - if event.end_of_turn and not event.turn_is_formatted: - params = StreamingSessionParameters( - format_turns=True, - ) - - self.set_params(params) - def on_terminated(self: Type[StreamingClient], event: TerminationEvent): print( @@ -1126,13 +1119,6 @@ def on_begin(self: Type[StreamingClient], event: BeginEvent): def on_turn(self: Type[StreamingClient], event: TurnEvent): print(f"{event.transcript} ({event.end_of_turn})") - if event.end_of_turn and not event.turn_is_formatted: - params = StreamingSessionParameters( - format_turns=True, - ) - - self.set_params(params) - def on_terminated(self: Type[StreamingClient], event: TerminationEvent): print( diff --git a/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx b/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx index 3184d795..08df41eb 100644 --- a/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx +++ b/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx @@ -19,7 +19,7 @@ Multilingual currently supports: English, Spanish, French, German, Italian, and 
## Language detection -The multilingual streaming model supports automatic language detection, allowing you to identify which language is being spoken in real-time. When enabled, the model returns the detected language code and confidence score with each complete utterance. +The multilingual streaming model supports automatic language detection, allowing you to identify which language is being spoken in real-time. When enabled, the model returns the detected language code and confidence score with each complete utterance and final turn. ### Configuration diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx index d5f0d30f..e2fcb251 100644 --- a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx +++ b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx @@ -169,13 +169,6 @@ print(f"Session started: {event.id}") def on_turn(self: Type[StreamingClient], event: TurnEvent): print(f"{event.transcript} ({event.end_of_turn})") - if event.end_of_turn and not event.turn_is_formatted: - params = StreamingSessionParameters( - format_turns=True, - ) - - self.set_params(params) - def on_terminated(self: Type[StreamingClient], event: TerminationEvent): print( f"Session terminated: {event.audio_duration_seconds} seconds of audio processed" diff --git a/llm-gateway.yml b/llm-gateway.yml index 9f22764e..c0fa0e46 100644 --- a/llm-gateway.yml +++ b/llm-gateway.yml @@ -81,7 +81,7 @@ paths: translation: target_languages: ["es", "de"] formal: true - # match_original_utterance: true + match_original_utterance: true speaker_identification_example: summary: Speaker identification request value: @@ -101,7 +101,6 @@ paths: date: "mm/dd/yyyy" phone_number: "(xxx)xxx-xxxx" email: "username@domain.com" - format_utterances: true responses: "200": description: Successful response containing the speech understanding results. 
@@ -112,6 +111,66 @@ paths: - $ref: "#/components/schemas/LLMGatewayTranslationResponse" - $ref: "#/components/schemas/LLMGatewaySpeakerIdentificationResponse" - $ref: "#/components/schemas/LLMGatewayCustomFormattingResponse" + examples: + complete_response: + summary: Complete response example + value: + speech_understanding: + request: + translation: + target_languages: ["es", "de"] + formal: true + match_original_utterance: true + speaker_identification: + speaker_type: "role" + known_values: ["interviewer", "candidate"] + custom_formatting: + date: "mm/dd/yyyy" + phone_number: "(xxx)xxx-xxxx" + email: "username@domain.com" + response: + translation: + status: "completed" + speaker_identification: + status: "completed" + custom_formatting: + status: "completed" + mapping: + "2024-12-25": "12/25/2024" + "555-123-4567": "(555)123-4567" + formatted_text: "Call me at (555)123-4567 on 12/25/2024" + formatted_utterances: + - confidence: 0.92 + start: 0 + end: 2500 + text: "Hi, I'm the interviewer. Call me at (555)123-4567 on 12/25/2024" + speaker: "interviewer" + - confidence: 0.95 + start: 2500 + end: 5000 + text: "Thanks! I'll reach out then." + speaker: "candidate" + translated_texts: + es: "Hola, soy el entrevistador. Llámame al cinco cinco cinco uno dos tres cuatro cinco seis siete el veinticinco de diciembre de dos mil veinticuatro. ¡Gracias! Me pondré en contacto entonces." + de: "Hallo, ich bin der Interviewer. Rufen Sie mich an unter fünf fünf fünf eins zwei drei vier fünf sechs sieben am fünfundzwanzigsten Dezember zweitausendvierundzwanzig. Danke! Ich werde mich dann melden." + utterances: + - confidence: 0.92 + start: 0 + end: 2500 + text: "Hi, I'm the interviewer. Call me at five five five one two three four five six seven on December twenty fifth twenty twenty four" + speaker: "interviewer" + translated_texts: + es: "Hola, soy el entrevistador. 
Llámame al cinco cinco cinco uno dos tres cuatro cinco seis siete el veinticinco de diciembre de dos mil veinticuatro" + de: "Hallo, ich bin der Interviewer. Rufen Sie mich an unter fünf fünf fünf eins zwei drei vier fünf sechs sieben am fünfundzwanzigsten Dezember zweitausendvierundzwanzig" + - confidence: 0.95 + start: 2500 + end: 5000 + text: "Thanks! I'll reach out then." + speaker: "candidate" + translated_texts: + es: "¡Gracias! Me pondré en contacto entonces." + de: "Danke! Ich werde mich dann melden." + words: [] default: description: An unexpected error occurred. content: @@ -414,10 +473,10 @@ components: type: boolean description: Use formal language style default: true - # match_original_utterance: - # type: boolean - # description: Get translated utterances (if speaker_labels was enabled) - # default: false + match_original_utterance: + type: boolean + description: When enabled with Speaker Labels, returns translated text in the utterances array. Each utterance will include a `translated_texts` key containing translations for each target language. + default: false required: - target_languages required: @@ -431,13 +490,13 @@ components: properties: speaker_type: type: string - enum: [role, name] + enum: [role, name, name_role] description: Type of speaker identification known_values: type: array items: type: string - description: Required if speaker_type is "role". Each value must be 35 characters or less. + description: Optional list of known speaker names or roles. When provided, helps identify speakers more accurately. Each value must be 35 characters or less. 
required: - speaker_type required: @@ -458,10 +517,6 @@ components: email: type: string description: Email format pattern (e.g., `"username@domain.com"`) - format_utterances: - type: boolean - description: Whether to format utterances - default: true required: - custom_formatting @@ -484,12 +539,20 @@ components: translated_texts: type: object description: 'Translated text keyed by language code (e.g., `{"es": "Texto traducido"}`)' - # Unsure how to display dynamic keys. + additionalProperties: + type: string utterances: type: array + description: Array of utterances with translations (when match_original_utterance is true) items: type: object + properties: + translated_texts: + type: object + description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when match_original_utterance is enabled.' + additionalProperties: + type: string words: type: array items: @@ -544,14 +607,13 @@ components: type: string formatted_utterances: type: array + description: Array of formatted utterances. Only included when utterances exist and formatting was applied. items: type: object status: type: string required: - mapping - - formatted_text - - formatted_utterances - status utterances: type: array diff --git a/openapi.yml b/openapi.yml index 506b26b7..5ffd3243 100644 --- a/openapi.yml +++ b/openapi.yml @@ -1296,7 +1296,7 @@ components: x-label: Code switching confidence threshold description: | The confidence threshold for code switching detection. If the code switching confidence is below this threshold, the transcript will be processed in the language with the highest `language_detection_confidence` score. 
- type: float + type: number minimum: 0 maximum: 1 default: 0.3 @@ -1634,7 +1634,7 @@ components: { request: { - translation: { target_languages: ["es", "de"], formal: true }, + translation: { target_languages: ["es", "de"], formal: true, match_original_utterance: true }, }, }, } @@ -1698,7 +1698,7 @@ components: { request: { - translation: { target_languages: ["es", "de"], formal: true }, + translation: { target_languages: ["es", "de"], formal: true, match_original_utterance: true }, }, }, } @@ -1822,6 +1822,12 @@ components: x-label: Speaker description: The speaker of this utterance, where each speaker is assigned a sequential capital letter - e.g. "A" for Speaker A, "B" for Speaker B, etc. type: string + translated_texts: + x-label: Translated texts + description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when match_original_utterance is enabled with translation.' + type: object + additionalProperties: + type: string required: - confidence - start @@ -2719,7 +2725,7 @@ components: x-label: Code switching confidence threshold description: | The confidence threshold for code switching detection. If the code switching confidence is below this threshold, the transcript will be processed in the language with the highest `language_detection_confidence` score. - type: float + type: number minimum: 0 maximum: 1 default: 0.3 @@ -6077,10 +6083,10 @@ components: type: boolean description: Use formal language style default: true - # match_original_utterance: - # type: boolean - # description: Get translated utterances (if speaker_labels was enabled) - # default: false + match_original_utterance: + type: boolean + description: When enabled with Speaker Labels, returns translated text in the utterances array. Each utterance will include a `translated_texts` key containing translations for each target language. 
+ default: false required: - target_languages required: @@ -6094,13 +6100,13 @@ components: properties: speaker_type: type: string - enum: [role, name] + enum: [role, name, name_role] description: Type of speaker identification known_values: type: array items: type: string - description: Required if speaker_type is "role". Each value must be 35 characters or less. + description: Optional list of known speaker names or roles. When provided, helps identify speakers more accurately. Each value must be 35 characters or less. required: - speaker_type required: @@ -6121,10 +6127,6 @@ components: email: type: string description: Email format pattern (e.g., `"username@domain.com"`) - format_utterances: - type: boolean - description: Whether to format utterances - default: true required: - custom_formatting diff --git a/usm-streaming.yml b/usm-streaming.yml index 76d7a4a0..e570c111 100644 --- a/usm-streaming.yml +++ b/usm-streaming.yml @@ -80,7 +80,7 @@ channels: type: string language_detection: - description: Whether to detect the language of the audio stream + description: Whether to detect the language and return language metadata on utterances and final turns. Only available for the multilingual model. location: $message.payload#/language_detection enum: ["true", "false"] default: "false" @@ -121,6 +121,11 @@ channels: description: API token for authentication location: $message.payload#/token + vad_threshold: + description: The confidence threshold (0.0 to 1.0) for classifying audio frames as silence. Frames with VAD confidence below this value are considered silent. Increase for noisy environments to reduce false speech detection. + location: $message.payload#/vad_threshold + default: "0.4" + ApiKey: description: >- Use your API key for authentication, or alternatively generate a [temporary token](/docs/api-reference/streaming-api/generate-streaming-token) and pass it via the `token` query parameter. 
From 862df4d3c1631511f594f2039649db841121fdfe Mon Sep 17 00:00:00 2001 From: gsharp-aai Date: Tue, 23 Dec 2025 17:04:29 -0800 Subject: [PATCH 2/3] Add edits --- llm-gateway.yml | 6 +++--- openapi.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llm-gateway.yml b/llm-gateway.yml index c0fa0e46..0a6bc9f3 100644 --- a/llm-gateway.yml +++ b/llm-gateway.yml @@ -490,13 +490,13 @@ components: properties: speaker_type: type: string - enum: [role, name, name_role] + enum: [role, name] description: Type of speaker identification known_values: type: array items: type: string - description: Optional list of known speaker names or roles. When provided, helps identify speakers more accurately. Each value must be 35 characters or less. + description: Required if speaker_type is “role”. Each value must be 35 characters or less. required: - speaker_type required: @@ -550,7 +550,7 @@ components: properties: translated_texts: type: object - description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when match_original_utterance is enabled.' + description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when `match_original_utterance` is enabled.' additionalProperties: type: string words: diff --git a/openapi.yml b/openapi.yml index 5ffd3243..9899f3e1 100644 --- a/openapi.yml +++ b/openapi.yml @@ -1824,7 +1824,7 @@ components: type: string translated_texts: x-label: Translated texts - description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when match_original_utterance is enabled with translation.' + description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when `match_original_utterance` is enabled with translation.' 
type: object additionalProperties: type: string @@ -6100,13 +6100,13 @@ components: properties: speaker_type: type: string - enum: [role, name, name_role] + enum: [role, name] description: Type of speaker identification known_values: type: array items: type: string - description: Optional list of known speaker names or roles. When provided, helps identify speakers more accurately. Each value must be 35 characters or less. + description: Required if speaker_type is “role”. Each value must be 35 characters or less. required: - speaker_type required: From 01916ffb69c5218a85cbf8567ae05e8b4d8eb4ff Mon Sep 17 00:00:00 2001 From: gsharp-aai Date: Tue, 23 Dec 2025 17:15:48 -0800 Subject: [PATCH 3/3] Add final edit --- llm-gateway.yml | 4 ++-- openapi.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llm-gateway.yml b/llm-gateway.yml index 0a6bc9f3..ac23960b 100644 --- a/llm-gateway.yml +++ b/llm-gateway.yml @@ -496,7 +496,7 @@ components: type: array items: type: string - description: Required if speaker_type is “role”. Each value must be 35 characters or less. + description: Required if speaker_type is "role". Each value must be 35 characters or less. required: - speaker_type required: @@ -544,7 +544,7 @@ components: utterances: type: array - description: Array of utterances with translations (when match_original_utterance is true) + description: Array of utterances with translations (when `match_original_utterance` is true) items: type: object properties: diff --git a/openapi.yml b/openapi.yml index 9899f3e1..aef932ea 100644 --- a/openapi.yml +++ b/openapi.yml @@ -6106,7 +6106,7 @@ components: type: array items: type: string - description: Required if speaker_type is “role”. Each value must be 35 characters or less. + description: Required if speaker_type is "role". Each value must be 35 characters or less. required: - speaker_type required: