diff --git a/fern/pages/01-getting-started/transcribe-streaming-audio.mdx b/fern/pages/01-getting-started/transcribe-streaming-audio.mdx index 6e8e2bc7..b19b1ee2 100644 --- a/fern/pages/01-getting-started/transcribe-streaming-audio.mdx +++ b/fern/pages/01-getting-started/transcribe-streaming-audio.mdx @@ -59,13 +59,6 @@ def on_begin(self: Type[StreamingClient], event: BeginEvent): def on_turn(self: Type[StreamingClient], event: TurnEvent): print(f"{event.transcript} ({event.end_of_turn})") - if event.end_of_turn and not event.turn_is_formatted: - params = StreamingSessionParameters( - format_turns=True, - ) - - self.set_params(params) - def on_terminated(self: Type[StreamingClient], event: TerminationEvent): print( @@ -1126,13 +1119,6 @@ def on_begin(self: Type[StreamingClient], event: BeginEvent): def on_turn(self: Type[StreamingClient], event: TurnEvent): print(f"{event.transcript} ({event.end_of_turn})") - if event.end_of_turn and not event.turn_is_formatted: - params = StreamingSessionParameters( - format_turns=True, - ) - - self.set_params(params) - def on_terminated(self: Type[StreamingClient], event: TerminationEvent): print( diff --git a/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx b/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx index 3184d795..08df41eb 100644 --- a/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx +++ b/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx @@ -19,7 +19,7 @@ Multilingual currently supports: English, Spanish, French, German, Italian, and ## Language detection -The multilingual streaming model supports automatic language detection, allowing you to identify which language is being spoken in real-time. When enabled, the model returns the detected language code and confidence score with each complete utterance. +The multilingual streaming model supports automatic language detection, allowing you to identify which language is being spoken in real-time. 
When enabled, the model returns the detected language code and confidence score with each complete utterance and final turn. ### Configuration diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx index d5f0d30f..e2fcb251 100644 --- a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx +++ b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx @@ -169,13 +169,6 @@ print(f"Session started: {event.id}") def on_turn(self: Type[StreamingClient], event: TurnEvent): print(f"{event.transcript} ({event.end_of_turn})") - if event.end_of_turn and not event.turn_is_formatted: - params = StreamingSessionParameters( - format_turns=True, - ) - - self.set_params(params) - def on_terminated(self: Type[StreamingClient], event: TerminationEvent): print( f"Session terminated: {event.audio_duration_seconds} seconds of audio processed" diff --git a/llm-gateway.yml b/llm-gateway.yml index 9f22764e..ac23960b 100644 --- a/llm-gateway.yml +++ b/llm-gateway.yml @@ -81,7 +81,7 @@ paths: translation: target_languages: ["es", "de"] formal: true - # match_original_utterance: true + match_original_utterance: true speaker_identification_example: summary: Speaker identification request value: @@ -101,7 +101,6 @@ paths: date: "mm/dd/yyyy" phone_number: "(xxx)xxx-xxxx" email: "username@domain.com" - format_utterances: true responses: "200": description: Successful response containing the speech understanding results. 
@@ -112,6 +111,66 @@ paths: - $ref: "#/components/schemas/LLMGatewayTranslationResponse" - $ref: "#/components/schemas/LLMGatewaySpeakerIdentificationResponse" - $ref: "#/components/schemas/LLMGatewayCustomFormattingResponse" + examples: + complete_response: + summary: Complete response example + value: + speech_understanding: + request: + translation: + target_languages: ["es", "de"] + formal: true + match_original_utterance: true + speaker_identification: + speaker_type: "role" + known_values: ["interviewer", "candidate"] + custom_formatting: + date: "mm/dd/yyyy" + phone_number: "(xxx)xxx-xxxx" + email: "username@domain.com" + response: + translation: + status: "completed" + speaker_identification: + status: "completed" + custom_formatting: + status: "completed" + mapping: + "2024-12-25": "12/25/2024" + "555-1234-5678": "(555)123-45678" + formatted_text: "Call me at (555)123-45678 on 12/25/2024" + formatted_utterances: + - confidence: 0.92 + start: 0 + end: 2500 + text: "Hi, I'm the interviewer. Call me at (555)123-45678 on 12/25/2024" + speaker: "interviewer" + - confidence: 0.95 + start: 2500 + end: 5000 + text: "Thanks! I'll reach out then." + speaker: "candidate" + translated_texts: + es: "Hola, soy el entrevistador. Llámame al cinco cinco cinco uno dos tres cuatro cinco seis siete ocho el veinticinco de diciembre de dos mil veinticuatro. ¡Gracias! Me pondré en contacto entonces." + de: "Hallo, ich bin der Interviewer. Rufen Sie mich an unter fünf fünf fünf eins zwei drei vier fünf sechs sieben acht am fünfundzwanzigsten Dezember zweitausendvierundzwanzig. Danke! Ich werde mich dann melden." + utterances: + - confidence: 0.92 + start: 0 + end: 2500 + text: "Hi, I'm the interviewer. Call me at five five five one two three four five six seven eight on December twenty fifth twenty twenty four" + speaker: "interviewer" + translated_texts: + es: "Hola, soy el entrevistador. 
Llámame al cinco cinco cinco uno dos tres cuatro cinco seis siete ocho el veinticinco de diciembre de dos mil veinticuatro" + de: "Hallo, ich bin der Interviewer. Rufen Sie mich an unter fünf fünf fünf eins zwei drei vier fünf sechs sieben acht am fünfundzwanzigsten Dezember zweitausendvierundzwanzig" + - confidence: 0.95 + start: 2500 + end: 5000 + text: "Thanks! I'll reach out then." + speaker: "candidate" + translated_texts: + es: "¡Gracias! Me pondré en contacto entonces." + de: "Danke! Ich werde mich dann melden." + words: [] default: description: An unexpected error occurred. content: @@ -414,10 +473,10 @@ components: type: boolean description: Use formal language style default: true - # match_original_utterance: - # type: boolean - # description: Get translated utterances (if speaker_labels was enabled) - # default: false + match_original_utterance: + type: boolean + description: When enabled with Speaker Labels, returns translated text in the utterances array. Each utterance will include a `translated_texts` key containing translations for each target language. + default: false required: - target_languages required: @@ -458,10 +517,6 @@ components: email: type: string description: Email format pattern (e.g., `"username@domain.com"`) - format_utterances: - type: boolean - description: Whether to format utterances - default: true required: - custom_formatting @@ -484,12 +539,20 @@ components: translated_texts: type: object description: 'Translated text keyed by language code (e.g., `{"es": "Texto traducido"}`)' - # Unsure how to display dynamic keys. + additionalProperties: + type: string utterances: type: array + description: Array of utterances with translations (when `match_original_utterance` is true) items: type: object + properties: + translated_texts: + type: object + description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when `match_original_utterance` is enabled.' 
+ additionalProperties: + type: string words: type: array items: @@ -544,14 +607,13 @@ components: type: string formatted_utterances: type: array + description: Array of formatted utterances. Only included when utterances exist and formatting was applied. items: type: object status: type: string required: - mapping - - formatted_text - - formatted_utterances - status utterances: type: array diff --git a/openapi.yml b/openapi.yml index 506b26b7..aef932ea 100644 --- a/openapi.yml +++ b/openapi.yml @@ -1296,7 +1296,7 @@ components: x-label: Code switching confidence threshold description: | The confidence threshold for code switching detection. If the code switching confidence is below this threshold, the transcript will be processed in the language with the highest `language_detection_confidence` score. - type: float + type: number minimum: 0 maximum: 1 default: 0.3 @@ -1634,7 +1634,7 @@ components: { request: { - translation: { target_languages: ["es", "de"], formal: true }, + translation: { target_languages: ["es", "de"], formal: true, match_original_utterance: true }, }, }, } @@ -1698,7 +1698,7 @@ components: { request: { - translation: { target_languages: ["es", "de"], formal: true }, + translation: { target_languages: ["es", "de"], formal: true, match_original_utterance: true }, }, }, } @@ -1822,6 +1822,12 @@ components: x-label: Speaker description: The speaker of this utterance, where each speaker is assigned a sequential capital letter - e.g. "A" for Speaker A, "B" for Speaker B, etc. type: string + translated_texts: + x-label: Translated texts + description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when `match_original_utterance` is enabled with translation.' 
+ type: object + additionalProperties: + type: string required: - confidence - start @@ -2719,7 +2725,7 @@ components: x-label: Code switching confidence threshold description: | The confidence threshold for code switching detection. If the code switching confidence is below this threshold, the transcript will be processed in the language with the highest `language_detection_confidence` score. - type: float + type: number minimum: 0 maximum: 1 default: 0.3 @@ -6077,10 +6083,10 @@ components: type: boolean description: Use formal language style default: true - # match_original_utterance: - # type: boolean - # description: Get translated utterances (if speaker_labels was enabled) - # default: false + match_original_utterance: + type: boolean + description: When enabled with Speaker Labels, returns translated text in the utterances array. Each utterance will include a `translated_texts` key containing translations for each target language. + default: false required: - target_languages required: @@ -6121,10 +6127,6 @@ components: email: type: string description: Email format pattern (e.g., `"username@domain.com"`) - format_utterances: - type: boolean - description: Whether to format utterances - default: true required: - custom_formatting diff --git a/usm-streaming.yml b/usm-streaming.yml index 76d7a4a0..e570c111 100644 --- a/usm-streaming.yml +++ b/usm-streaming.yml @@ -80,7 +80,7 @@ channels: type: string language_detection: - description: Whether to detect the language of the audio stream + description: Whether to detect the language and return language metadata on utterances and final turns. Only available for the multilingual model. location: $message.payload#/language_detection enum: ["true", "false"] default: "false" @@ -121,6 +121,11 @@ channels: description: API token for authentication location: $message.payload#/token + vad_threshold: + description: The confidence threshold (0.0 to 1.0) for classifying audio frames as silence. 
Frames with VAD confidence below this value are considered silent. Increase this value in noisy environments to reduce false speech detection. location: $message.payload#/vad_threshold default: "0.4" ApiKey: description: >- Use your API key for authentication, or alternatively generate a [temporary token](/docs/api-reference/streaming-api/generate-streaming-token) and pass it via the `token` query parameter.