Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/voice-gapless-chunk-playback.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@cloudflare/voice": patch
---

Fix audible clicks at audio chunk boundaries during agent speech. `VoiceClient` played each response chunk by starting it at `currentTime` and waiting for its `ended` event before scheduling the next, so every chunk seam carried a few milliseconds of silence (event-loop latency plus the next chunk's setup) — audible as a periodic click, roughly one per chunk. Chunks are now scheduled back-to-back on the audio clock via a playback cursor (`start(Math.max(currentTime, cursor))`), so consecutive chunks butt together sample-tight. Because chunks can now be scheduled ahead of playback, the client tracks every scheduled source and stops them all on interrupt/end-call (previously only the single active source needed stopping), and playback counts as active until the last scheduled chunk finishes so barge-in detection keeps working through the scheduled tail.
178 changes: 146 additions & 32 deletions packages/voice/src/tests/voice-client.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,16 @@ class FakeAudioBufferSourceNode {
onended: (() => void) | null = null;
stopped = false;
started = false;
startedAt: number | null = null;
connectedTo: unknown = null;

/**
 * Records the node this fake source was connected to so tests can inspect
 * the wiring through `connectedTo`. No real audio routing happens.
 */
connect(destination: unknown): void {
this.connectedTo = destination;
}

start(): void {
start(when?: number): void {
this.started = true;
this.startedAt = when ?? null;
}

stop(): void {
Expand All @@ -58,7 +60,9 @@ class FakeAudioBufferSourceNode {

class FakeAudioContext {
state: AudioContextState = "running";
currentTime = 0;
source: FakeAudioBufferSourceNode | null = null;
sources: FakeAudioBufferSourceNode[] = [];
deferDecode = false;
pendingDecode: (() => void) | null = null;
destination = {};
Expand All @@ -70,14 +74,27 @@ class FakeAudioContext {
async close(): Promise<void> {}

async decodeAudioData(_audioData: ArrayBuffer): Promise<AudioBuffer> {
if (!this.deferDecode) return {} as AudioBuffer;
const decoded = { duration: 0.5 } as AudioBuffer;
if (!this.deferDecode) return decoded;
return new Promise((resolve) => {
this.pendingDecode = () => resolve({} as AudioBuffer);
this.pendingDecode = () => resolve(decoded);
});
}

/**
 * Builds a minimal stand-in for an `AudioBuffer`.
 *
 * Only the two members set here exist on the returned object: `duration`
 * (computed as `length / sampleRate`) and `getChannelData`, which returns a
 * fresh zero-filled `Float32Array` of `length` samples on every call. The
 * channel count is accepted but ignored.
 */
createBuffer(
  _channels: number,
  length: number,
  sampleRate: number
): AudioBuffer {
  const duration = length / sampleRate;
  const getChannelData = (): Float32Array => new Float32Array(length);
  const fakeBuffer = { duration, getChannelData };
  return fakeBuffer as unknown as AudioBuffer;
}

/**
 * Creates a fake buffer source node, remembers it as the most recent
 * `source`, and appends it to `sources` so tests can inspect every node
 * that was created, in creation order.
 */
createBufferSource(): AudioBufferSourceNode {
  const node = new FakeAudioBufferSourceNode();
  this.source = node;
  this.sources.push(node);
  return node as unknown as AudioBufferSourceNode;
}

Expand Down Expand Up @@ -180,41 +197,53 @@ async function waitForPlayCount(count: number): Promise<void> {
throw new Error(`expected audio play count to reach ${count}`);
}

describe("VoiceClient playback interrupt", () => {
beforeEach(() => {
originalAudioContext = globalThis.AudioContext;
originalAudio = globalThis.Audio;
audioContext = new FakeAudioContext();
audioElement = new FakeAudioElement();
Object.defineProperty(globalThis, "AudioContext", {
configurable: true,
value: class {
constructor() {
return audioContext;
}
/**
 * Yields to the microtask queue until the fake audio context has created at
 * least `count` buffer sources, then returns the context's `sources` array.
 *
 * The check runs at most 20 times, each followed by one microtask tick.
 *
 * @param count Minimum number of created sources to wait for.
 * @returns The fake audio context's `sources` array (the same array
 *   instance, not a copy).
 * @throws Error if fewer than `count` sources exist after 20 checks.
 */
async function waitForSourceCount(
  count: number
): Promise<FakeAudioBufferSourceNode[]> {
  let checksLeft = 20;
  while (checksLeft > 0) {
    if (audioContext.sources.length >= count) {
      return audioContext.sources;
    }
    await Promise.resolve();
    checksLeft -= 1;
  }
  const actual = audioContext.sources.length;
  throw new Error(`expected ${count} audio sources, got ${actual}`);
}

beforeEach(() => {
originalAudioContext = globalThis.AudioContext;
originalAudio = globalThis.Audio;
audioContext = new FakeAudioContext();
audioElement = new FakeAudioElement();
Object.defineProperty(globalThis, "AudioContext", {
configurable: true,
value: class {
constructor() {
return audioContext;
}
});
Object.defineProperty(globalThis, "Audio", {
configurable: true,
value: class {
constructor() {
return audioElement;
}
}
});
Object.defineProperty(globalThis, "Audio", {
configurable: true,
value: class {
constructor() {
return audioElement;
}
});
}
});
});

afterEach(() => {
Object.defineProperty(globalThis, "AudioContext", {
configurable: true,
value: originalAudioContext
});
Object.defineProperty(globalThis, "Audio", {
configurable: true,
value: originalAudio
});
afterEach(() => {
Object.defineProperty(globalThis, "AudioContext", {
configurable: true,
value: originalAudioContext
});
Object.defineProperty(globalThis, "Audio", {
configurable: true,
value: originalAudio
});
});

describe("VoiceClient playback interrupt", () => {
it("stops active playback when the server sends playback_interrupt", async () => {
const transport = new MockTransport();
const client = new VoiceClient({ agent: "test-agent", transport });
Expand Down Expand Up @@ -618,3 +647,88 @@ describe("VoiceClient playback interrupt", () => {
expect(audioContext.source).toBeNull();
});
});

describe("VoiceClient gapless playback", () => {
  // One PCM16 chunk: 1600 samples at 2 bytes each, i.e. 0.1s at 16kHz.
  const pcm16Chunk = (): ArrayBuffer => {
    const sampleCount = 1600;
    const bytesPerSample = 2;
    return new ArrayBuffer(sampleCount * bytesPerSample);
  };

  // Connects a client over a mock transport, then delivers an audio_config
  // message that selects the pcm16 format.
  const startPcm16Call = (): {
    transport: MockTransport;
    client: VoiceClient;
  } => {
    const transport = new MockTransport();
    const client = new VoiceClient({ agent: "test-agent", transport });
    client.connect();
    const audioConfig = JSON.stringify({
      type: "audio_config",
      format: "pcm16"
    });
    transport.receive(audioConfig);
    return { transport, client };
  };

  // Delivers `count` PCM16 chunks to the client, one after another.
  const receiveChunks = (transport: MockTransport, count: number): void => {
    for (let delivered = 0; delivered < count; delivered++) {
      transport.receive(pcm16Chunk());
    }
  };

  it("schedules consecutive chunks back-to-back instead of waiting for ended", async () => {
    const call = startPcm16Call();
    audioContext.currentTime = 5;

    receiveChunks(call.transport, 2);
    const [first, second] = await waitForSourceCount(2);

    // The first chunk is still playing when the second is scheduled, and
    // the second begins exactly where the first ends on the audio clock.
    expect(first.startedAt).toBe(5);
    expect(first.stopped).toBe(false);
    expect(second.startedAt).toBeCloseTo(5.1, 10);
  });

  it("starts at the current time when playback has fallen behind the cursor", async () => {
    const call = startPcm16Call();
    audioContext.currentTime = 5;
    receiveChunks(call.transport, 1);
    await waitForSourceCount(1);

    // Move the clock well beyond the end of the first chunk.
    audioContext.currentTime = 7;
    receiveChunks(call.transport, 1);
    const [, lateChunk] = await waitForSourceCount(2);

    expect(lateChunk.startedAt).toBe(7);
  });

  it("stops every scheduled chunk on playback_interrupt", async () => {
    const call = startPcm16Call();
    receiveChunks(call.transport, 3);
    const scheduled = await waitForSourceCount(3);

    const interrupt = JSON.stringify({ type: "playback_interrupt" });
    call.transport.receive(interrupt);

    const allStopped = scheduled.every((node) => node.stopped);
    expect(allStopped).toBe(true);
  });

  it("still treats playback as active after the queue drains, so a user transcript interrupts the scheduled tail", async () => {
    const call = startPcm16Call();
    receiveChunks(call.transport, 2);
    const scheduled = await waitForSourceCount(2);

    const userTranscript = JSON.stringify({
      type: "transcript",
      role: "user",
      text: "hold on"
    });
    call.transport.receive(userTranscript);

    const allStopped = scheduled.every((node) => node.stopped);
    expect(allStopped).toBe(true);
  });

  it("resets the playback cursor when a call ends", async () => {
    const call = startPcm16Call();
    audioContext.currentTime = 5;
    receiveChunks(call.transport, 1);
    await waitForSourceCount(1);

    call.client.endCall();
    audioContext.currentTime = 2;
    receiveChunks(call.transport, 1);
    const [, afterReset] = await waitForSourceCount(2);

    // A stale cursor would have placed this chunk at 5.1 instead.
    expect(afterReset.startedAt).toBe(2);
  });
});
50 changes: 33 additions & 17 deletions packages/voice/src/voice-client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,9 @@ export class VoiceClient {
#isSpeaking = false;
#playbackQueue: ArrayBuffer[] = [];
#isPlaying = false;
#activeSource: AudioBufferSourceNode | null = null;
#isScheduling = false;
#scheduledSources = new Set<AudioBufferSourceNode>();
#playbackCursor = 0;
#playbackElement: HTMLAudioElement | null = null;
#playbackDestination: MediaStreamAudioDestinationNode | null = null;
#playbackDestinationPromise: Promise<AudioNode> | null = null;
Expand Down Expand Up @@ -909,25 +911,34 @@ export class VoiceClient {
const source = ctx.createBufferSource();
source.buffer = audioBuffer;
source.connect(destination);
if (generation !== this.#playbackGeneration) return;
this.#activeSource = source;
this.#scheduledSources.add(source);
source.onended = () => {
this.#scheduledSources.delete(source);
if (
generation === this.#playbackGeneration &&
!this.#isScheduling &&
this.#scheduledSources.size === 0 &&
this.#playbackQueue.length === 0
) {
this.#isPlaying = false;
}
};

return new Promise<void>((resolve) => {
source.onended = () => {
if (this.#activeSource === source) {
this.#activeSource = null;
}
resolve();
};
source.start();
});
// Schedule on the audio clock, butted against the previous chunk.
// Starting at currentTime only after the previous chunk's `ended`
// event leaves a few ms of silence at every chunk seam, audible as
// a periodic click during agent speech.
const startAt = Math.max(ctx.currentTime, this.#playbackCursor);
this.#playbackCursor = startAt + audioBuffer.duration;
source.start(startAt);
} catch (err) {
console.error("[VoiceClient] Audio playback error:", err);
}
}

async #processPlaybackQueue(): Promise<void> {
if (this.#isPlaying || this.#playbackQueue.length === 0) return;
if (this.#isScheduling || this.#playbackQueue.length === 0) return;
this.#isScheduling = true;
this.#isPlaying = true;
const generation = this.#playbackGeneration;

Expand All @@ -940,15 +951,18 @@ export class VoiceClient {
}

if (generation === this.#playbackGeneration) {
this.#isPlaying = false;
this.#isScheduling = false;
if (this.#scheduledSources.size === 0) {
this.#isPlaying = false;
}
}
}

#stopPlayback(): void {
const source = this.#activeSource;
this.#playbackGeneration++;
this.#activeSource = null;
if (source) {
const sources = [...this.#scheduledSources];
this.#scheduledSources.clear();
for (const source of sources) {
try {
source.stop();
} catch {
Expand All @@ -957,6 +971,8 @@ export class VoiceClient {
}
this.#playbackQueue = [];
this.#isPlaying = false;
this.#isScheduling = false;
this.#playbackCursor = 0;
}

// --- Mic capture ---
Expand Down
Loading