cloudflare · cjol · Jun 12, 2026 · Jun 12, 2026
diff --git a/.changeset/voice-gapless-chunk-playback.md b/.changeset/voice-gapless-chunk-playback.md
@@ -0,0 +1,5 @@
+---
+"@cloudflare/voice": patch
+---
+
+Fix audible clicks at audio chunk boundaries during agent speech. `VoiceClient` played each response chunk by starting it at `currentTime` and waiting for its `ended` event before scheduling the next, so every chunk seam carried a few milliseconds of silence (event-loop latency plus the next chunk's setup) — audible as a periodic click, roughly one per chunk. Chunks are now scheduled back-to-back on the audio clock via a playback cursor (`start(Math.max(currentTime, cursor))`), so consecutive chunks butt together sample-tight. Because chunks can now be scheduled ahead of playback, the client tracks every scheduled source and stops them all on interrupt/end-call (previously only the single active source needed stopping), and playback counts as active until the last scheduled chunk finishes so barge-in detection keeps working through the scheduled tail.
diff --git a/packages/voice/src/tests/voice-client.test.ts b/packages/voice/src/tests/voice-client.test.ts
@@ -39,14 +39,16 @@ class FakeAudioBufferSourceNode {
   onended: (() => void) | null = null;
   stopped = false;
   started = false;
+  startedAt: number | null = null;
   connectedTo: unknown = null;
 
   connect(destination: unknown): void {
     this.connectedTo = destination;
   }
 
-  start(): void {
+  start(when?: number): void {
     this.started = true;
+    this.startedAt = when ?? null;
   }
 
   stop(): void {
@@ -58,7 +60,9 @@ class FakeAudioBufferSourceNode {
 
 class FakeAudioContext {
   state: AudioContextState = "running";
+  currentTime = 0;
   source: FakeAudioBufferSourceNode | null = null;
+  sources: FakeAudioBufferSourceNode[] = [];
   deferDecode = false;
   pendingDecode: (() => void) | null = null;
   destination = {};
@@ -70,14 +74,27 @@ class FakeAudioContext {
   async close(): Promise<void> {}
 
   async decodeAudioData(_audioData: ArrayBuffer): Promise<AudioBuffer> {
-    if (!this.deferDecode) return {} as AudioBuffer;
+    const decoded = { duration: 0.5 } as AudioBuffer;
+    if (!this.deferDecode) return decoded;
     return new Promise((resolve) => {
-      this.pendingDecode = () => resolve({} as AudioBuffer);
+      this.pendingDecode = () => resolve(decoded);
     });
   }
 
+  createBuffer(
+    _channels: number,
+    length: number,
+    sampleRate: number
+  ): AudioBuffer {
+    return {
+      duration: length / sampleRate,
+      getChannelData: () => new Float32Array(length)
+    } as unknown as AudioBuffer;
+  }
+
   createBufferSource(): AudioBufferSourceNode {
     this.source = new FakeAudioBufferSourceNode();
+    this.sources.push(this.source);
     return this.source as unknown as AudioBufferSourceNode;
   }
 
@@ -180,41 +197,53 @@ async function waitForPlayCount(count: number): Promise<void> {
   throw new Error(`expected audio play count to reach ${count}`);
 }
 
-describe("VoiceClient playback interrupt", () => {
-  beforeEach(() => {
-    originalAudioContext = globalThis.AudioContext;
-    originalAudio = globalThis.Audio;
-    audioContext = new FakeAudioContext();
-    audioElement = new FakeAudioElement();
-    Object.defineProperty(globalThis, "AudioContext", {
-      configurable: true,
-      value: class {
-        constructor() {
-          return audioContext;
-        }
+async function waitForSourceCount(
+  count: number
+): Promise<FakeAudioBufferSourceNode[]> {
+  for (let i = 0; i < 20; i++) {
+    if (audioContext.sources.length >= count) return audioContext.sources;
+    await Promise.resolve();
+  }
+  throw new Error(
+    `expected ${count} audio sources, got ${audioContext.sources.length}`
+  );
+}
+
+beforeEach(() => {
+  originalAudioContext = globalThis.AudioContext;
+  originalAudio = globalThis.Audio;
+  audioContext = new FakeAudioContext();
+  audioElement = new FakeAudioElement();
+  Object.defineProperty(globalThis, "AudioContext", {
+    configurable: true,
+    value: class {
+      constructor() {
+        return audioContext;
       }
-    });
-    Object.defineProperty(globalThis, "Audio", {
-      configurable: true,
-      value: class {
-        constructor() {
-          return audioElement;
-        }
+    }
+  });
+  Object.defineProperty(globalThis, "Audio", {
+    configurable: true,
+    value: class {
+      constructor() {
+        return audioElement;
       }
-    });
+    }
   });
+});
 
-  afterEach(() => {
-    Object.defineProperty(globalThis, "AudioContext", {
-      configurable: true,
-      value: originalAudioContext
-    });
-    Object.defineProperty(globalThis, "Audio", {
-      configurable: true,
-      value: originalAudio
-    });
+afterEach(() => {
+  Object.defineProperty(globalThis, "AudioContext", {
+    configurable: true,
+    value: originalAudioContext
   });
+  Object.defineProperty(globalThis, "Audio", {
+    configurable: true,
+    value: originalAudio
+  });
+});
 
+describe("VoiceClient playback interrupt", () => {
   it("stops active playback when the server sends playback_interrupt", async () => {
     const transport = new MockTransport();
     const client = new VoiceClient({ agent: "test-agent", transport });
@@ -618,3 +647,88 @@ describe("VoiceClient playback interrupt", () => {
     expect(audioContext.source).toBeNull();
   });
 });
+
+describe("VoiceClient gapless playback", () => {
+  // 1600 samples of 16-bit PCM = 0.1s at 16kHz
+  function pcm16Chunk(): ArrayBuffer {
+    return new ArrayBuffer(1600 * 2);
+  }
+
+  function startPcm16Call(): { transport: MockTransport; client: VoiceClient } {
+    const transport = new MockTransport();
+    const client = new VoiceClient({ agent: "test-agent", transport });
+    client.connect();
+    transport.receive(
+      JSON.stringify({ type: "audio_config", format: "pcm16" })
+    );
+    return { transport, client };
+  }
+
+  it("schedules consecutive chunks back-to-back instead of waiting for ended", async () => {
+    const { transport } = startPcm16Call();
+    audioContext.currentTime = 5;
+
+    transport.receive(pcm16Chunk());
+    transport.receive(pcm16Chunk());
+    const sources = await waitForSourceCount(2);
+
+    // The second chunk is scheduled while the first is still playing,
+    // starting exactly where the first ends on the audio clock.
+    expect(sources[0].startedAt).toBe(5);
+    expect(sources[0].stopped).toBe(false);
+    expect(sources[1].startedAt).toBeCloseTo(5.1, 10);
+  });
+
+  it("starts at the current time when playback has fallen behind the cursor", async () => {
+    const { transport } = startPcm16Call();
+    audioContext.currentTime = 5;
+    transport.receive(pcm16Chunk());
+    await waitForSourceCount(1);
+
+    audioContext.currentTime = 7; // well past the first chunk's end
+    transport.receive(pcm16Chunk());
+    const sources = await waitForSourceCount(2);
+
+    expect(sources[1].startedAt).toBe(7);
+  });
+
+  it("stops every scheduled chunk on playback_interrupt", async () => {
+    const { transport } = startPcm16Call();
+    transport.receive(pcm16Chunk());
+    transport.receive(pcm16Chunk());
+    transport.receive(pcm16Chunk());
+    const sources = await waitForSourceCount(3);
+
+    transport.receive(JSON.stringify({ type: "playback_interrupt" }));
+
+    expect(sources.every((source) => source.stopped)).toBe(true);
+  });
+
+  it("still treats playback as active after the queue drains, so a user transcript interrupts the scheduled tail", async () => {
+    const { transport } = startPcm16Call();
+    transport.receive(pcm16Chunk());
+    transport.receive(pcm16Chunk());
+    const sources = await waitForSourceCount(2);
+
+    transport.receive(
+      JSON.stringify({ type: "transcript", role: "user", text: "hold on" })
+    );
+
+    expect(sources.every((source) => source.stopped)).toBe(true);
+  });
+
+  it("resets the playback cursor when a call ends", async () => {
+    const { transport, client } = startPcm16Call();
+    audioContext.currentTime = 5;
+    transport.receive(pcm16Chunk());
+    await waitForSourceCount(1);
+
+    client.endCall();
+    audioContext.currentTime = 2;
+    transport.receive(pcm16Chunk());
+    const sources = await waitForSourceCount(2);
+
+    // Without the reset this would start at the stale 5.1 cursor.
+    expect(sources[1].startedAt).toBe(2);
+  });
+});
diff --git a/packages/voice/src/voice-client.ts b/packages/voice/src/voice-client.ts
@@ -293,7 +293,9 @@ export class VoiceClient {
   #isSpeaking = false;
   #playbackQueue: ArrayBuffer[] = [];
   #isPlaying = false;
-  #activeSource: AudioBufferSourceNode | null = null;
+  #isScheduling = false;
+  #scheduledSources = new Set<AudioBufferSourceNode>();
+  #playbackCursor = 0;
   #playbackElement: HTMLAudioElement | null = null;
   #playbackDestination: MediaStreamAudioDestinationNode | null = null;
   #playbackDestinationPromise: Promise<AudioNode> | null = null;
@@ -909,25 +911,34 @@ export class VoiceClient {
       const source = ctx.createBufferSource();
       source.buffer = audioBuffer;
       source.connect(destination);
-      if (generation !== this.#playbackGeneration) return;
-      this.#activeSource = source;
+      this.#scheduledSources.add(source);
+      source.onended = () => {
+        this.#scheduledSources.delete(source);
+        if (
+          generation === this.#playbackGeneration &&
+          !this.#isScheduling &&
+          this.#scheduledSources.size === 0 &&
+          this.#playbackQueue.length === 0
+        ) {
+          this.#isPlaying = false;
+        }
+      };
 
-      return new Promise<void>((resolve) => {
-        source.onended = () => {
-          if (this.#activeSource === source) {
-            this.#activeSource = null;
-          }
-          resolve();
-        };
-        source.start();
-      });
+      // Schedule on the audio clock, butted against the previous chunk.
+      // Starting at currentTime only after the previous chunk's `ended`
+      // event leaves a few ms of silence at every chunk seam, audible as
+      // a periodic click during agent speech.
+      const startAt = Math.max(ctx.currentTime, this.#playbackCursor);
+      this.#playbackCursor = startAt + audioBuffer.duration;
+      source.start(startAt);
     } catch (err) {
       console.error("[VoiceClient] Audio playback error:", err);
     }
   }
 
   async #processPlaybackQueue(): Promise<void> {
-    if (this.#isPlaying || this.#playbackQueue.length === 0) return;
+    if (this.#isScheduling || this.#playbackQueue.length === 0) return;
+    this.#isScheduling = true;
     this.#isPlaying = true;
     const generation = this.#playbackGeneration;
 
@@ -940,15 +951,18 @@ export class VoiceClient {
     }
 
     if (generation === this.#playbackGeneration) {
-      this.#isPlaying = false;
+      this.#isScheduling = false;
+      if (this.#scheduledSources.size === 0) {
+        this.#isPlaying = false;
+      }
     }
   }
 
   #stopPlayback(): void {
-    const source = this.#activeSource;
     this.#playbackGeneration++;
-    this.#activeSource = null;
-    if (source) {
+    const sources = [...this.#scheduledSources];
+    this.#scheduledSources.clear();
+    for (const source of sources) {
       try {
         source.stop();
       } catch {
@@ -957,6 +971,8 @@ export class VoiceClient {
     }
     this.#playbackQueue = [];
     this.#isPlaying = false;
+    this.#isScheduling = false;
+    this.#playbackCursor = 0;
   }
 
   // --- Mic capture ---