From 72c26598a6a699635e264702d03c6f60f9a71019 Mon Sep 17 00:00:00 2001
From: gsharp-aai <gsharp@assemblyai.com>
Date: Tue, 23 Dec 2025 15:28:50 -0800
Subject: [PATCH 1/3] Update streaming docs and specs for translation and VAD options

---
 .../transcribe-streaming-audio.mdx            | 14 ---
 .../universal-streaming/multilingual.mdx      |  2 +-
 .../universal-streaming.mdx                   |  7 --
 llm-gateway.yml                               | 92 ++++++++++++++++---
 openapi.yml                                   | 30 +++---
 usm-streaming.yml                             |  7 +-
 6 files changed, 100 insertions(+), 52 deletions(-)

diff --git a/fern/pages/01-getting-started/transcribe-streaming-audio.mdx b/fern/pages/01-getting-started/transcribe-streaming-audio.mdx
index 6e8e2bc7..b19b1ee2 100644
--- a/fern/pages/01-getting-started/transcribe-streaming-audio.mdx
+++ b/fern/pages/01-getting-started/transcribe-streaming-audio.mdx
@@ -59,13 +59,6 @@ def on_begin(self: Type[StreamingClient], event: BeginEvent):
 def on_turn(self: Type[StreamingClient], event: TurnEvent):
     print(f"{event.transcript} ({event.end_of_turn})")
 
-    if event.end_of_turn and not event.turn_is_formatted:
-        params = StreamingSessionParameters(
-            format_turns=True,
-        )
-
-        self.set_params(params)
-
 
 def on_terminated(self: Type[StreamingClient], event: TerminationEvent):
     print(
@@ -1126,13 +1119,6 @@ def on_begin(self: Type[StreamingClient], event: BeginEvent):
 def on_turn(self: Type[StreamingClient], event: TurnEvent):
     print(f"{event.transcript} ({event.end_of_turn})")
 
-    if event.end_of_turn and not event.turn_is_formatted:
-        params = StreamingSessionParameters(
-            format_turns=True,
-        )
-
-        self.set_params(params)
-
 
 def on_terminated(self: Type[StreamingClient], event: TerminationEvent):
     print(
diff --git a/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx b/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx
index 3184d795..08df41eb 100644
--- a/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx
+++ b/fern/pages/02-speech-to-text/universal-streaming/multilingual.mdx
@@ -19,7 +19,7 @@ Multilingual currently supports: English, Spanish, French, German, Italian, and
 
 ## Language detection
 
-The multilingual streaming model supports automatic language detection, allowing you to identify which language is being spoken in real-time. When enabled, the model returns the detected language code and confidence score with each complete utterance.
+The multilingual streaming model supports automatic language detection, allowing you to identify which language is being spoken in real-time. When enabled, the model returns the detected language code and confidence score with each complete utterance and final turn.
 
 ### Configuration
 
diff --git a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx
index d5f0d30f..e2fcb251 100644
--- a/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx
+++ b/fern/pages/02-speech-to-text/universal-streaming/universal-streaming.mdx
@@ -169,13 +169,6 @@ print(f"Session started: {event.id}")
 def on_turn(self: Type[StreamingClient], event: TurnEvent):
 print(f"{event.transcript} ({event.end_of_turn})")
 
-    if event.end_of_turn and not event.turn_is_formatted:
-        params = StreamingSessionParameters(
-            format_turns=True,
-        )
-
-        self.set_params(params)
-
 def on_terminated(self: Type[StreamingClient], event: TerminationEvent):
 print(
 f"Session terminated: {event.audio_duration_seconds} seconds of audio processed"
diff --git a/llm-gateway.yml b/llm-gateway.yml
index 9f22764e..c0fa0e46 100644
--- a/llm-gateway.yml
+++ b/llm-gateway.yml
@@ -81,7 +81,7 @@ paths:
                       translation:
                         target_languages: ["es", "de"]
                         formal: true
-                        # match_original_utterance: true
+                        match_original_utterance: true
               speaker_identification_example:
                 summary: Speaker identification request
                 value:
@@ -101,7 +101,6 @@ paths:
                         date: "mm/dd/yyyy"
                         phone_number: "(xxx)xxx-xxxx"
                         email: "username@domain.com"
-                        format_utterances: true
       responses:
         "200":
           description: Successful response containing the speech understanding results.
@@ -112,6 +111,66 @@ paths:
                   - $ref: "#/components/schemas/LLMGatewayTranslationResponse"
                   - $ref: "#/components/schemas/LLMGatewaySpeakerIdentificationResponse"
                   - $ref: "#/components/schemas/LLMGatewayCustomFormattingResponse"
+              examples:
+                complete_response:
+                  summary: Complete response example
+                  value:
+                    speech_understanding:
+                      request:
+                        translation:
+                          target_languages: ["es", "de"]
+                          formal: true
+                          match_original_utterance: true
+                        speaker_identification:
+                          speaker_type: "role"
+                          known_values: ["interviewer", "candidate"]
+                        custom_formatting:
+                          date: "mm/dd/yyyy"
+                          phone_number: "(xxx)xxx-xxxx"
+                          email: "username@domain.com"
+                      response:
+                        translation:
+                          status: "completed"
+                        speaker_identification:
+                          status: "completed"
+                        custom_formatting:
+                          status: "completed"
+                          mapping:
+                            "2024-12-25": "12/25/2024"
+                            "555-1234-5678": "(555)123-45678"
+                          formatted_text: "Call me at (555)123-45678 on 12/25/2024"
+                          formatted_utterances:
+                            - confidence: 0.92
+                              start: 0
+                              end: 2500
+                              text: "Hi, I'm the interviewer. Call me at (555)123-45678 on 12/25/2024"
+                              speaker: "interviewer"
+                            - confidence: 0.95
+                              start: 2500
+                              end: 5000
+                              text: "Thanks! I'll reach out then."
+                              speaker: "candidate"
+                    translated_texts:
+                      es: "Hola, soy el entrevistador. Llámame al cinco cinco cinco uno dos tres cuatro cinco seis siete ocho el veinticinco de diciembre de dos mil veinticuatro. ¡Gracias! Me pondré en contacto entonces."
+                      de: "Hallo, ich bin der Interviewer. Rufen Sie mich an unter fünf fünf fünf eins zwei drei vier fünf sechs sieben acht am fünfundzwanzigsten Dezember zweitausendvierundzwanzig. Danke! Ich werde mich dann melden."
+                    utterances:
+                      - confidence: 0.92
+                        start: 0
+                        end: 2500
+                        text: "Hi, I'm the interviewer. Call me at five five five one two three four five six seven eight on December twenty fifth twenty twenty four"
+                        speaker: "interviewer"
+                        translated_texts:
+                          es: "Hola, soy el entrevistador. Llámame al cinco cinco cinco uno dos tres cuatro cinco seis siete ocho el veinticinco de diciembre de dos mil veinticuatro"
+                          de: "Hallo, ich bin der Interviewer. Rufen Sie mich an unter fünf fünf fünf eins zwei drei vier fünf sechs sieben acht am fünfundzwanzigsten Dezember zweitausendvierundzwanzig"
+                      - confidence: 0.95
+                        start: 2500
+                        end: 5000
+                        text: "Thanks! I'll reach out then."
+                        speaker: "candidate"
+                        translated_texts:
+                          es: "¡Gracias! Me pondré en contacto entonces."
+                          de: "Danke! Ich werde mich dann melden."
+                    words: []
         default:
           description: An unexpected error occurred.
           content:
@@ -414,10 +473,10 @@ components:
               type: boolean
               description: Use formal language style
               default: true
-            # match_original_utterance:
-            #   type: boolean
-            #   description: Get translated utterances (if speaker_labels was enabled)
-            #   default: false
+            match_original_utterance:
+              type: boolean
+              description: When enabled with Speaker Labels, returns translated text in the utterances array. Each utterance will include a `translated_texts` key containing translations for each target language.
+              default: false
           required:
             - target_languages
       required:
@@ -431,13 +490,13 @@ components:
           properties:
             speaker_type:
               type: string
-              enum: [role, name]
+              enum: [role, name, name_role]
               description: Type of speaker identification
             known_values:
               type: array
               items:
                 type: string
-              description: Required if speaker_type is "role". Each value must be 35 characters or less.
+              description: Optional list of known speaker names or roles. When provided, helps identify speakers more accurately. Each value must be 35 characters or less.
           required:
             - speaker_type
       required:
@@ -458,10 +517,6 @@ components:
             email:
               type: string
               description: Email format pattern (e.g., `"username@domain.com"`)
-            format_utterances:
-              type: boolean
-              description: Whether to format utterances
-              default: true
       required:
         - custom_formatting
 
@@ -484,12 +539,20 @@ components:
         translated_texts:
           type: object
           description: 'Translated text keyed by language code (e.g., `{"es": "Texto traducido"}`)'
-          # Unsure how to display dynamic keys.
+          additionalProperties:
+            type: string
 
         utterances:
           type: array
+          description: Array of utterances with translations (when match_original_utterance is true)
           items:
             type: object
+            properties:
+              translated_texts:
+                type: object
+                description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when match_original_utterance is enabled.'
+                additionalProperties:
+                  type: string
         words:
           type: array
           items:
@@ -544,14 +607,13 @@ components:
                       type: string
                     formatted_utterances:
                       type: array
+                      description: Array of formatted utterances. Only included when utterances exist and formatting was applied.
                       items:
                         type: object
                     status:
                       type: string
                   required:
                     - mapping
-                    - formatted_text
-                    - formatted_utterances
                     - status
         utterances:
           type: array
diff --git a/openapi.yml b/openapi.yml
index 506b26b7..5ffd3243 100644
--- a/openapi.yml
+++ b/openapi.yml
@@ -1296,7 +1296,7 @@ components:
               x-label: Code switching confidence threshold
               description: |
                 The confidence threshold for code switching detection. If the code switching confidence is below this threshold, the transcript will be processed in the language with the highest `language_detection_confidence` score.
-              type: float
+              type: number
               minimum: 0
               maximum: 1
               default: 0.3
@@ -1634,7 +1634,7 @@ components:
             {
               request:
                 {
-                  translation: { target_languages: ["es", "de"], formal: true },
+                  translation: { target_languages: ["es", "de"], formal: true, match_original_utterance: true },
                 },
             },
         }
@@ -1698,7 +1698,7 @@ components:
             {
               request:
                 {
-                  translation: { target_languages: ["es", "de"], formal: true },
+                  translation: { target_languages: ["es", "de"], formal: true, match_original_utterance: true },
                 },
             },
         }
@@ -1822,6 +1822,12 @@ components:
           x-label: Speaker
           description: The speaker of this utterance, where each speaker is assigned a sequential capital letter - e.g. "A" for Speaker A, "B" for Speaker B, etc.
           type: string
+        translated_texts:
+          x-label: Translated texts
+          description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when match_original_utterance is enabled with translation.'
+          type: object
+          additionalProperties:
+            type: string
       required:
         - confidence
         - start
@@ -2719,7 +2725,7 @@ components:
               x-label: Code switching confidence threshold
               description: |
                 The confidence threshold for code switching detection. If the code switching confidence is below this threshold, the transcript will be processed in the language with the highest `language_detection_confidence` score.
-              type: float
+              type: number
               minimum: 0
               maximum: 1
               default: 0.3
@@ -6077,10 +6083,10 @@ components:
               type: boolean
               description: Use formal language style
               default: true
-            # match_original_utterance:
-            #   type: boolean
-            #   description: Get translated utterances (if speaker_labels was enabled)
-            #   default: false
+            match_original_utterance:
+              type: boolean
+              description: When enabled with Speaker Labels, returns translated text in the utterances array. Each utterance will include a `translated_texts` key containing translations for each target language.
+              default: false
           required:
             - target_languages
       required:
@@ -6094,13 +6100,13 @@ components:
           properties:
             speaker_type:
               type: string
-              enum: [role, name]
+              enum: [role, name, name_role]
               description: Type of speaker identification
             known_values:
               type: array
               items:
                 type: string
-              description: Required if speaker_type is "role". Each value must be 35 characters or less.
+              description: Optional list of known speaker names or roles. When provided, helps identify speakers more accurately. Each value must be 35 characters or less.
           required:
             - speaker_type
       required:
@@ -6121,10 +6127,6 @@ components:
             email:
               type: string
               description: Email format pattern (e.g., `"username@domain.com"`)
-            format_utterances:
-              type: boolean
-              description: Whether to format utterances
-              default: true
       required:
         - custom_formatting
 
diff --git a/usm-streaming.yml b/usm-streaming.yml
index 76d7a4a0..e570c111 100644
--- a/usm-streaming.yml
+++ b/usm-streaming.yml
@@ -80,7 +80,7 @@ channels:
             type: string
 
       language_detection:
-        description: Whether to detect the language of the audio stream
+        description: Whether to detect the language and return language metadata on utterances and final turns. Only available for the multilingual model.
         location: $message.payload#/language_detection
         enum: ["true", "false"]
         default: "false"
@@ -121,6 +121,11 @@ channels:
         description: API token for authentication
         location: $message.payload#/token
 
+      vad_threshold:
+        description: The confidence threshold (0.0 to 1.0) for classifying audio frames as silence. Frames with VAD confidence below this value are considered silent. Increase for noisy environments to reduce false speech detection.
+        location: $message.payload#/vad_threshold
+        default: "0.4"
+
       ApiKey:
         description: >-
           Use your API key for authentication, or alternatively generate a [temporary token](/docs/api-reference/streaming-api/generate-streaming-token) and pass it via the `token` query parameter.

From 862df4d3c1631511f594f2039649db841121fdfe Mon Sep 17 00:00:00 2001
From: gsharp-aai <gsharp@assemblyai.com>
Date: Tue, 23 Dec 2025 17:04:29 -0800
Subject: [PATCH 2/3] Revert name_role enum; backtick match_original_utterance

---
 llm-gateway.yml | 6 +++---
 openapi.yml     | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llm-gateway.yml b/llm-gateway.yml
index c0fa0e46..0a6bc9f3 100644
--- a/llm-gateway.yml
+++ b/llm-gateway.yml
@@ -490,13 +490,13 @@ components:
           properties:
             speaker_type:
               type: string
-              enum: [role, name, name_role]
+              enum: [role, name]
               description: Type of speaker identification
             known_values:
               type: array
               items:
                 type: string
-              description: Optional list of known speaker names or roles. When provided, helps identify speakers more accurately. Each value must be 35 characters or less.
+              description: Required if speaker_type is “role”. Each value must be 35 characters or less.
           required:
             - speaker_type
       required:
@@ -550,7 +550,7 @@ components:
             properties:
               translated_texts:
                 type: object
-                description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when match_original_utterance is enabled.'
+                description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when `match_original_utterance` is enabled.'
                 additionalProperties:
                   type: string
         words:
diff --git a/openapi.yml b/openapi.yml
index 5ffd3243..9899f3e1 100644
--- a/openapi.yml
+++ b/openapi.yml
@@ -1824,7 +1824,7 @@ components:
           type: string
         translated_texts:
           x-label: Translated texts
-          description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when match_original_utterance is enabled with translation.'
+          description: 'Translations keyed by language code (e.g., `{"es": "Texto traducido", "de": "Übersetzter Text"}`). Only present when `match_original_utterance` is enabled with translation.'
           type: object
           additionalProperties:
             type: string
@@ -6100,13 +6100,13 @@ components:
           properties:
             speaker_type:
               type: string
-              enum: [role, name, name_role]
+              enum: [role, name]
               description: Type of speaker identification
             known_values:
               type: array
               items:
                 type: string
-              description: Optional list of known speaker names or roles. When provided, helps identify speakers more accurately. Each value must be 35 characters or less.
+              description: Required if speaker_type is “role”. Each value must be 35 characters or less.
           required:
             - speaker_type
       required:

From 01916ffb69c5218a85cbf8567ae05e8b4d8eb4ff Mon Sep 17 00:00:00 2001
From: gsharp-aai <gsharp@assemblyai.com>
Date: Tue, 23 Dec 2025 17:15:48 -0800
Subject: [PATCH 3/3] Use straight quotes and backticks in spec descriptions

---
 llm-gateway.yml | 4 ++--
 openapi.yml     | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/llm-gateway.yml b/llm-gateway.yml
index 0a6bc9f3..ac23960b 100644
--- a/llm-gateway.yml
+++ b/llm-gateway.yml
@@ -496,7 +496,7 @@ components:
               type: array
               items:
                 type: string
-              description: Required if speaker_type is “role”. Each value must be 35 characters or less.
+              description: Required if speaker_type is "role". Each value must be 35 characters or less.
           required:
             - speaker_type
       required:
@@ -544,7 +544,7 @@ components:
 
         utterances:
           type: array
-          description: Array of utterances with translations (when match_original_utterance is true)
+          description: Array of utterances with translations (when `match_original_utterance` is true)
           items:
             type: object
             properties:
diff --git a/openapi.yml b/openapi.yml
index 9899f3e1..aef932ea 100644
--- a/openapi.yml
+++ b/openapi.yml
@@ -6106,7 +6106,7 @@ components:
               type: array
               items:
                 type: string
-              description: Required if speaker_type is “role”. Each value must be 35 characters or less.
+              description: Required if speaker_type is "role". Each value must be 35 characters or less.
           required:
             - speaker_type
       required: