diff --git a/CHANGELOG.md b/CHANGELOG.md index 3796893..10c09f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Changelog +## [4.20.0] + +- Add `on_low_language_confidence` property to `LanguageDetectionOptions` + > Controls behavior when language confidence is below threshold. Either "error" (default) or "fallback". + > When set to "fallback", the transcription will use the fallback language instead of erroring when confidence is low. + ## [4.8.0] - Add `multichannel` property to `TranscriptParams` diff --git a/docs/compat.md b/docs/compat.md index 7300c0c..9a4ca0e 100644 --- a/docs/compat.md +++ b/docs/compat.md @@ -13,8 +13,8 @@ If you do use an older version of Node.js like version 16, you'll need to polyfi To make the SDK compatible with the browser, the SDK aims to use web standards as much as possible. However, there are still incompatibilities between Node.js and the browser. -- `StreamingTranscriber` doesn't support the AssemblyAI API key in the browser. - Instead, you have to generate a temporary auth token using `client.streaming.createTemporaryToken`, and pass in the resulting token to the streaming transcriber. +- `RealtimeTranscriber` doesn't support the AssemblyAI API key in the browser. + Instead, you have to generate a temporary auth token using `client.realtime.createTemporaryToken`, and pass in the resulting token to the real-time transcriber. Generate a temporary auth token on the server. @@ -23,7 +23,7 @@ However, there are still incompatibilities between Node.js and the browser. // Ideally, to avoid embedding your API key client side, // you generate this token on the server, and pass it to the client via an API. 
const client = new AssemblyAI({ apiKey: "YOUR_API_KEY" }); - const token = await client.streaming.createTemporaryToken({ expires_in_seconds: 60 }); + const token = await client.realtime.createTemporaryToken({ expires_in: 480 }); ``` > [!NOTE] @@ -31,16 +31,16 @@ However, there are still incompatibilities between Node.js and the browser. > If you embed the API key on the client, everyone can see it and use it for themselves. Then pass the token via an API to the client. - On the client, create an instance of `StreamingTranscriber` using the token. + On the client, create an instance of `RealtimeTranscriber` using the token. ```js - import { StreamingTranscriber } from "assemblyai"; + import { RealtimeTranscriber } from "assemblyai"; // or the following if you're using UMD - // const { StreamingTranscriber } = assemblyai; + // const { RealtimeTranscriber } = assemblyai; const token = getToken(); // getToken is a function for you to implement - const rt = new StreamingTranscriber({ + const rt = new RealtimeTranscriber({ token: token, }); ``` diff --git a/package.json b/package.json index d8e967a..e2edb47 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "assemblyai", - "version": "4.19.0", + "version": "4.21.0", "description": "The AssemblyAI JavaScript SDK provides an easy-to-use interface for interacting with the AssemblyAI API, which supports async and real-time transcription, as well as the latest LeMUR models.", "engines": { "node": ">=18" diff --git a/src/services/streaming/service.ts b/src/services/streaming/service.ts index 8b8430a..111d371 100644 --- a/src/services/streaming/service.ts +++ b/src/services/streaming/service.ts @@ -97,6 +97,10 @@ export class StreamingTranscriber { ); } + if (this.params.vadThreshold !== undefined) { + searchParams.set("vad_threshold", this.params.vadThreshold.toString()); + } + if (this.params.formatTurns) { searchParams.set("format_turns", this.params.formatTurns.toString()); } @@ -128,6 +132,20 @@ export 
class StreamingTranscriber { searchParams.set("speech_model", this.params.speechModel.toString()); } + if (this.params.languageDetection !== undefined) { + searchParams.set( + "language_detection", + this.params.languageDetection.toString(), + ); + } + + if (this.params.inactivityTimeout !== undefined) { + searchParams.set( + "inactivity_timeout", + this.params.inactivityTimeout.toString(), + ); + } + url.search = searchParams.toString(); return url; diff --git a/src/types/openapi.generated.ts b/src/types/openapi.generated.ts index e6b4f13..2fda274 100644 --- a/src/types/openapi.generated.ts +++ b/src/types/openapi.generated.ts @@ -1690,6 +1690,10 @@ export type LanguageDetectionOptions = { * The confidence threshold for the automatically detected code switching language. */ code_switching_confidence_threshold?: number | null; + /** + * Controls behavior when language confidence is below threshold. Either "error" (default) or "fallback". + */ + on_low_language_confidence?: string | null; }; /** diff --git a/src/types/streaming/index.ts b/src/types/streaming/index.ts index 3faf00e..252de74 100644 --- a/src/types/streaming/index.ts +++ b/src/types/streaming/index.ts @@ -9,11 +9,14 @@ export type StreamingTranscriberParams = { endOfTurnConfidenceThreshold?: number; minEndOfTurnSilenceWhenConfident?: number; maxTurnSilence?: number; + vadThreshold?: number; formatTurns?: boolean; filterProfanity?: boolean; keyterms?: string[]; keytermsPrompt?: string[]; speechModel?: StreamingSpeechModel; + languageDetection?: boolean; + inactivityTimeout?: number; }; export type StreamingEvents = "open" | "close" | "turn" | "error"; @@ -54,6 +57,8 @@ export type TurnEvent = { transcript: string; end_of_turn_confidence: number; words: StreamingWord[]; + language_code?: string; + language_confidence?: number; }; export type StreamingWord = { @@ -79,6 +84,7 @@ export type StreamingUpdateConfiguration = { end_of_turn_confidence_threshold?: number; min_end_of_turn_silence_when_confident?: 
number; max_turn_silence?: number; + vad_threshold?: number; format_turns?: boolean; }; diff --git a/tests/unit/language-detection-options.test.ts b/tests/unit/language-detection-options.test.ts index 380633b..8189679 100644 --- a/tests/unit/language-detection-options.test.ts +++ b/tests/unit/language-detection-options.test.ts @@ -143,4 +143,56 @@ describe("language detection options", () => { const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string); expect(requestBody.language_detection_options).toBe(null); }); + + it("should create transcript with on_low_language_confidence set to fallback", async () => { + const languageDetectionOptions: LanguageDetectionOptions = { + fallback_language: "en", + on_low_language_confidence: "fallback", + }; + + fetchMock.doMockOnceIf( + requestMatches({ url: "/v2/transcript", method: "POST" }), + JSON.stringify({ id: transcriptId, status: "queued" }), + ); + + const transcript = await assembly.transcripts.submit({ + audio_url: remoteAudioURL, + language_detection: true, + language_confidence_threshold: 0.8, + language_detection_options: languageDetectionOptions, + }); + + expect(transcript.id).toBe(transcriptId); + + const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string); + expect(requestBody.language_confidence_threshold).toBe(0.8); + expect(requestBody.language_detection_options.fallback_language).toBe("en"); + expect(requestBody.language_detection_options.on_low_language_confidence).toBe("fallback"); + }); + + it("should create transcript with on_low_language_confidence set to error", async () => { + const languageDetectionOptions: LanguageDetectionOptions = { + fallback_language: "en", + on_low_language_confidence: "error", + }; + + fetchMock.doMockOnceIf( + requestMatches({ url: "/v2/transcript", method: "POST" }), + JSON.stringify({ id: transcriptId, status: "queued" }), + ); + + const transcript = await assembly.transcripts.submit({ + audio_url: remoteAudioURL, + language_detection: 
true, + language_confidence_threshold: 0.7, + language_detection_options: languageDetectionOptions, + }); + + expect(transcript.id).toBe(transcriptId); + + const requestBody = JSON.parse(fetchMock.mock.calls[0][1]?.body as string); + expect(requestBody.language_confidence_threshold).toBe(0.7); + expect(requestBody.language_detection_options.fallback_language).toBe("en"); + expect(requestBody.language_detection_options.on_low_language_confidence).toBe("error"); + }); });