From 0e6a8734bfcdd03b84e4798e35e36cc0aa08c203 Mon Sep 17 00:00:00 2001
From: Gabriel Guralnick <gabriel@imbue.com>
Date: Tue, 31 Mar 2026 16:06:04 -0700
Subject: [PATCH 1/3] [Web] Pre-allocate TypedArray views for pod args in
 WebGPU dispatch

Hoist Int32Array/Uint32Array/Float32Array allocation out of the
per-dispatch submitShader closure into the per-shader scope. Since
podArgIndices.length is fixed for each shader, the views can be
safely reused: every slot (indices 0 through podArgIndices.length
inclusive, the last one holding packDimX) is written on each
dispatch before writeBuffer copies the data, so no stale values can
leak between invocations.

This avoids four allocations (three typed-array views plus their
backing ArrayBuffer) per GPU kernel dispatch, which adds up in
workloads with many small dispatches (e.g. LLM token generation).
---
 web/src/webgpu.ts | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts
index 55d188516d40..d640d3defd5f 100644
--- a/web/src/webgpu.ts
+++ b/web/src/webgpu.ts
@@ -697,6 +697,13 @@ export class WebGPUContext {
       bindGroupLayouts: [bindGroupLayout]
     });
 
+    // Pre-allocate typed array views for pod args (reused across dispatches)
+    const maxPodArgs = podArgIndices.length + 1; // +1 for packGridDimX
+    const podArgsArrayBuffer = new ArrayBuffer(maxPodArgs * 4);
+    const i32ViewCached = new Int32Array(podArgsArrayBuffer);
+    const u32ViewCached = new Uint32Array(podArgsArrayBuffer);
+    const f32ViewCached = new Float32Array(podArgsArrayBuffer);
+
     // Function to create the pipeline.
     const createShaderFunc = (pipeline: GPUComputePipeline): Function => {
       const submitShader = (...args: Array<GPUPointer | number>): void => {
@@ -759,32 +766,28 @@ export class WebGPUContext {
         const sizeOfI32 = 4;
         const bufBytes = (podArgIndices.length + 1) * sizeOfI32;
         const podArgBuffer = this.getUniformFromPool(bufBytes);
-        const i32View = new Int32Array(podArgIndices.length + 1);
-        const u32View = new Uint32Array(i32View.buffer);
-        const f32View = new Float32Array(i32View.buffer);
 
         for (let i = 0; i < podArgIndices.length; ++i) {
           const value = args[podArgIndices[i]];
           const dtype = finfo.arg_types[podArgIndices[i]];
           if (dtype.startsWith("int")) {
-            i32View[i] = value;
+            i32ViewCached[i] = value;
           } else if (dtype.startsWith("uint")) {
-            u32View[i] = value;
+            u32ViewCached[i] = value;
           } else if (dtype.startsWith("float")) {
-            f32View[i] = value;
+            f32ViewCached[i] = value;
           } else {
             throw Error("Unknown pod dtype " + dtype);
           }
         }
-        // always pass in dim z launching grid size in
-        u32View[podArgIndices.length] = packDimX;
-        this.device.queue.writeBuffer(podArgBuffer, 0, i32View.buffer);
+        u32ViewCached[podArgIndices.length] = packDimX;
+        this.device.queue.writeBuffer(podArgBuffer, 0, podArgsArrayBuffer);
 
         bindGroupEntries.push({
           binding: bufferArgIndices.length,
           resource: {
             buffer: podArgBuffer,
-            size: i32View.buffer.byteLength
+            size: podArgsArrayBuffer.byteLength
           }
         });
 

From ccb9b7774dc7f6a0e5e9dd17b58999195fe5fc9f Mon Sep 17 00:00:00 2001
From: Gabriel Guralnick <gabriel@imbue.com>
Date: Wed, 1 Apr 2026 12:07:42 -0700
Subject: [PATCH 2/3] Address review: use BYTES_PER_ELEMENT, hoist podArgBytes

---
 web/src/webgpu.ts | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts
index d640d3defd5f..b602f7d9d810 100644
--- a/web/src/webgpu.ts
+++ b/web/src/webgpu.ts
@@ -698,8 +698,9 @@ export class WebGPUContext {
     });
 
     // Pre-allocate typed array views for pod args (reused across dispatches)
-    const maxPodArgs = podArgIndices.length + 1; // +1 for packGridDimX
-    const podArgsArrayBuffer = new ArrayBuffer(maxPodArgs * 4);
+    const maxPodArgs = podArgIndices.length + 1; // +1 for packDimX
+    const podArgBytes = maxPodArgs * Int32Array.BYTES_PER_ELEMENT;
+    const podArgsArrayBuffer = new ArrayBuffer(podArgBytes);
     const i32ViewCached = new Int32Array(podArgsArrayBuffer);
     const u32ViewCached = new Uint32Array(podArgsArrayBuffer);
     const f32ViewCached = new Float32Array(podArgsArrayBuffer);
@@ -763,9 +764,7 @@ export class WebGPUContext {
           });
         }
 
-        const sizeOfI32 = 4;
-        const bufBytes = (podArgIndices.length + 1) * sizeOfI32;
-        const podArgBuffer = this.getUniformFromPool(bufBytes);
+        const podArgBuffer = this.getUniformFromPool(podArgBytes);
 
         for (let i = 0; i < podArgIndices.length; ++i) {
           const value = args[podArgIndices[i]];

From 7d850cfce16def55c91d2bef70d82c91890fc043 Mon Sep 17 00:00:00 2001
From: Gabriel Guralnick <gabriel@imbue.com>
Date: Thu, 2 Apr 2026 10:42:00 -0700
Subject: [PATCH 3/3] Address review: rename maxPodArgs, restore packDimX
 comment

Rename maxPodArgs to numPodSlots for clarity (it's a count, not a
maximum) and restore an explanatory comment for the packDimX uniform
slot assignment.
---
 web/src/webgpu.ts | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/web/src/webgpu.ts b/web/src/webgpu.ts
index b602f7d9d810..0f5ba7bcf1cd 100644
--- a/web/src/webgpu.ts
+++ b/web/src/webgpu.ts
@@ -698,8 +698,8 @@ export class WebGPUContext {
     });
 
     // Pre-allocate typed array views for pod args (reused across dispatches)
-    const maxPodArgs = podArgIndices.length + 1; // +1 for packDimX
-    const podArgBytes = maxPodArgs * Int32Array.BYTES_PER_ELEMENT;
+    const numPodSlots = podArgIndices.length + 1; // +1 for packDimX
+    const podArgBytes = numPodSlots * Int32Array.BYTES_PER_ELEMENT;
     const podArgsArrayBuffer = new ArrayBuffer(podArgBytes);
     const i32ViewCached = new Int32Array(podArgsArrayBuffer);
     const u32ViewCached = new Uint32Array(podArgsArrayBuffer);
@@ -779,6 +779,7 @@ export class WebGPUContext {
             throw Error("Unknown pod dtype " + dtype);
           }
         }
+        // Pass the original grid X dimension so the shader can recover blockIdx.x from the z-split
         u32ViewCached[podArgIndices.length] = packDimX;
         this.device.queue.writeBuffer(podArgBuffer, 0, podArgsArrayBuffer);