From 0e6a8734bfcdd03b84e4798e35e36cc0aa08c203 Mon Sep 17 00:00:00 2001 From: Gabriel Guralnick Date: Tue, 31 Mar 2026 16:06:04 -0700 Subject: [PATCH 1/3] [Web] Pre-allocate TypedArray views for pod args in WebGPU dispatch Hoist Int32Array/Uint32Array/Float32Array allocation out of the per-dispatch submitShader closure into the per-shader scope. Since podArgIndices.length is fixed for each shader, the views can be safely reused: every slot is written on each dispatch (indices 0..podArgIndices.length-1 by the pod-arg loop, and the final index podArgIndices.length by packDimX) before writeBuffer copies the data, so no stale values can leak between invocations. This avoids allocating 3 typed-array views and 1 ArrayBuffer per GPU kernel dispatch, which adds up in workloads with many small dispatches (e.g. LLM token generation). --- web/src/webgpu.ts | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts index 55d188516d40..d640d3defd5f 100644 --- a/web/src/webgpu.ts +++ b/web/src/webgpu.ts @@ -697,6 +697,13 @@ export class WebGPUContext { bindGroupLayouts: [bindGroupLayout] }); + // Pre-allocate typed array views for pod args (reused across dispatches) + const maxPodArgs = podArgIndices.length + 1; // +1 for packGridDimX + const podArgsArrayBuffer = new ArrayBuffer(maxPodArgs * 4); + const i32ViewCached = new Int32Array(podArgsArrayBuffer); + const u32ViewCached = new Uint32Array(podArgsArrayBuffer); + const f32ViewCached = new Float32Array(podArgsArrayBuffer); + // Function to create the pipeline. 
const createShaderFunc = (pipeline: GPUComputePipeline): Function => { const submitShader = (...args: Array): void => { @@ -759,32 +766,28 @@ export class WebGPUContext { const sizeOfI32 = 4; const bufBytes = (podArgIndices.length + 1) * sizeOfI32; const podArgBuffer = this.getUniformFromPool(bufBytes); - const i32View = new Int32Array(podArgIndices.length + 1); - const u32View = new Uint32Array(i32View.buffer); - const f32View = new Float32Array(i32View.buffer); for (let i = 0; i < podArgIndices.length; ++i) { const value = args[podArgIndices[i]]; const dtype = finfo.arg_types[podArgIndices[i]]; if (dtype.startsWith("int")) { - i32View[i] = value; + i32ViewCached[i] = value; } else if (dtype.startsWith("uint")) { - u32View[i] = value; + u32ViewCached[i] = value; } else if (dtype.startsWith("float")) { - f32View[i] = value; + f32ViewCached[i] = value; } else { throw Error("Unknown pod dtype " + dtype); } } - // always pass in dim z launching grid size in - u32View[podArgIndices.length] = packDimX; - this.device.queue.writeBuffer(podArgBuffer, 0, i32View.buffer); + u32ViewCached[podArgIndices.length] = packDimX; + this.device.queue.writeBuffer(podArgBuffer, 0, podArgsArrayBuffer); bindGroupEntries.push({ binding: bufferArgIndices.length, resource: { buffer: podArgBuffer, - size: i32View.buffer.byteLength + size: podArgsArrayBuffer.byteLength } }); From ccb9b7774dc7f6a0e5e9dd17b58999195fe5fc9f Mon Sep 17 00:00:00 2001 From: Gabriel Guralnick Date: Wed, 1 Apr 2026 12:07:42 -0700 Subject: [PATCH 2/3] Address review: use BYTES_PER_ELEMENT, hoist podArgBytes --- web/src/webgpu.ts | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts index d640d3defd5f..b602f7d9d810 100644 --- a/web/src/webgpu.ts +++ b/web/src/webgpu.ts @@ -698,8 +698,9 @@ export class WebGPUContext { }); // Pre-allocate typed array views for pod args (reused across dispatches) - const maxPodArgs = podArgIndices.length + 1; // +1 for packGridDimX 
- const podArgsArrayBuffer = new ArrayBuffer(maxPodArgs * 4); + const maxPodArgs = podArgIndices.length + 1; // +1 for packDimX + const podArgBytes = maxPodArgs * Int32Array.BYTES_PER_ELEMENT; + const podArgsArrayBuffer = new ArrayBuffer(podArgBytes); const i32ViewCached = new Int32Array(podArgsArrayBuffer); const u32ViewCached = new Uint32Array(podArgsArrayBuffer); const f32ViewCached = new Float32Array(podArgsArrayBuffer); @@ -763,9 +764,7 @@ export class WebGPUContext { }); } - const sizeOfI32 = 4; - const bufBytes = (podArgIndices.length + 1) * sizeOfI32; - const podArgBuffer = this.getUniformFromPool(bufBytes); + const podArgBuffer = this.getUniformFromPool(podArgBytes); for (let i = 0; i < podArgIndices.length; ++i) { const value = args[podArgIndices[i]]; From 7d850cfce16def55c91d2bef70d82c91890fc043 Mon Sep 17 00:00:00 2001 From: Gabriel Guralnick Date: Thu, 2 Apr 2026 10:42:00 -0700 Subject: [PATCH 3/3] Address review: rename maxPodArgs, restore packDimX comment Rename maxPodArgs to numPodSlots for clarity (it's a count, not a maximum) and restore an explanatory comment for the packDimX uniform slot assignment. 
--- web/src/webgpu.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts index b602f7d9d810..0f5ba7bcf1cd 100644 --- a/web/src/webgpu.ts +++ b/web/src/webgpu.ts @@ -698,8 +698,8 @@ export class WebGPUContext { }); // Pre-allocate typed array views for pod args (reused across dispatches) - const maxPodArgs = podArgIndices.length + 1; // +1 for packDimX - const podArgBytes = maxPodArgs * Int32Array.BYTES_PER_ELEMENT; + const numPodSlots = podArgIndices.length + 1; // +1 for packDimX + const podArgBytes = numPodSlots * Int32Array.BYTES_PER_ELEMENT; const podArgsArrayBuffer = new ArrayBuffer(podArgBytes); const i32ViewCached = new Int32Array(podArgsArrayBuffer); const u32ViewCached = new Uint32Array(podArgsArrayBuffer); @@ -779,6 +779,7 @@ export class WebGPUContext { throw Error("Unknown pod dtype " + dtype); } } + // Pass the original grid X dimension so the shader can recover blockIdx.x from the z-split u32ViewCached[podArgIndices.length] = packDimX; this.device.queue.writeBuffer(podArgBuffer, 0, podArgsArrayBuffer);