From af484ca71cc5c7fe65a670758ad6455d70dc5564 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <anthony@shoumikh.in>
Date: Mon, 15 Jun 2026 07:54:15 -0700
Subject: [PATCH 1/4] Fix webgpu native SDPA test: select attention output by
 position, not numel

The sdpa_with_kv_cache sweep picked the attention output among the
mutating op's outputs by matching element count. For llama1b_prefill
(Hq=32,Hkv=8,D=64,S=128,Cmax=512) the attention output (S*Hq*D=262144)
shares its numel with both the k and v caches (Cmax*Hkv*D=262144, since
S*Hq==Cmax*Hkv), so three tensors matched and the test failed with
'ambiguous attention output'. The op returns [k_cache, v_cache,
attn_output] in a fixed order, so select the last output instead and
keep a numel sanity check. Test-only; the kernel is correct.
---
 backends/webgpu/test/test_webgpu_native.cpp | 29 +++++++--------------
 1 file changed, 10 insertions(+), 19 deletions(-)
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index 338ecb39913..78ee955dbd9 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -662,32 +662,23 @@ static bool test_sdpa_config(
   }
 
   const auto& outputs = result.get();
-  // The mutating op returns [k_cache, v_cache, attn_output]; select the
-  // attention output (numel == S*Hq*D), not a mutated cache (numel Cmax*Hkv*D).
-  // Count matches and fail if ambiguous: a cache could share the same numel.
-  int attn_idx = -1;
-  int attn_matches = 0;
-  for (size_t i = 0; i < outputs.size(); i++) {
-    if (outputs[i].isTensor() && outputs[i].toTensor().numel() == on) {
-      attn_idx = static_cast<int>(i);
-      attn_matches++;
-    }
-  }
-  if (attn_idx < 0) {
+  // The mutating op returns [k_cache, v_cache, attn_output] in a fixed order,
+  // so the attention output is always the last output. Selecting it by element
+  // count is unsafe: a mutated cache (numel Cmax*Hkv*D) can share the attention
+  // output's numel (S*Hq*D) whenever S*Hq == Cmax*Hkv (e.g. llama1b_prefill).
+  if (outputs.empty() || !outputs.back().isTensor()) {
     printf(
-        "FAIL: no attention output (numel %d) among %zu outputs\n",
-        on,
-        outputs.size());
+        "FAIL: missing attention output among %zu outputs\n", outputs.size());
     return false;
   }
-  if (attn_matches > 1) {
+  const auto& out_tensor = outputs.back().toTensor();
+  if (out_tensor.numel() != on) {
     printf(
-        "FAIL: ambiguous attention output: %d tensors match numel %d\n",
-        attn_matches,
+        "FAIL: attention output numel %d != expected %d\n",
+        (int)out_tensor.numel(),
         on);
     return false;
   }
-  const auto& out_tensor = outputs[attn_idx].toTensor();
   const float* out_data = out_tensor.const_data_ptr<float>();
 
   std::vector<float> golden = load_golden(golden_path, on);

From 71200facc1112995f83b05e689aec0cab47e9d93 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <anthony@shoumikh.in>
Date: Mon, 15 Jun 2026 08:06:17 -0700
Subject: [PATCH 2/4] Address review: select attn output by position with count
 + per-slot numel asserts

Use the documented ExecuTorch output order [*mutated_inputs, *user_outputs] = [k_cache, v_cache, attn_output] and select the attention output by position (index 2) instead of by element count. Assert outputs.size()==3, that each output is a tensor, and that the two cache slots have numel Cmax*Hkv*D, so a future change in output arity/order fails loudly. Print numel as %zu/size_t to match the rest of the file and avoid truncation. Addresses Copilot review comments.
---
 backends/webgpu/test/test_webgpu_native.cpp | 41 ++++++++++++++++-----
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index 78ee955dbd9..a42cc067f82 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -662,20 +662,43 @@ static bool test_sdpa_config(
   }
 
   const auto& outputs = result.get();
-  // The mutating op returns [k_cache, v_cache, attn_output] in a fixed order,
-  // so the attention output is always the last output. Selecting it by element
-  // count is unsafe: a mutated cache (numel Cmax*Hkv*D) can share the attention
-  // output's numel (S*Hq*D) whenever S*Hq == Cmax*Hkv (e.g. llama1b_prefill).
-  if (outputs.empty() || !outputs.back().isTensor()) {
+  // sdpa_with_kv_cache mutates k_cache/v_cache in place and returns the
+  // attention output. ExecuTorch emits program outputs as
+  // [*mutated_inputs, *user_outputs], so forward() returns exactly
+  // [k_cache, v_cache, attn_output]: three tensors, attention output last.
+  // Selecting by element count is unsafe: when S*Hq*D == Cmax*Hkv*D
+  // (e.g. llama1b_prefill, all 262144) the attention output and both caches
+  // share numel. Select by the documented position instead, and assert the
+  // output count and per-slot numels so a future change in output structure
+  // still fails loudly.
+  if (outputs.size() != 3) {
     printf(
-        "FAIL: missing attention output among %zu outputs\n", outputs.size());
+        "FAIL: expected 3 outputs [k_cache, v_cache, attn_output], got %zu\n",
+        outputs.size());
     return false;
   }
-  const auto& out_tensor = outputs.back().toTensor();
+  for (size_t i = 0; i < outputs.size(); i++) {
+    if (!outputs[i].isTensor()) {
+      printf("FAIL: output %zu is not a tensor\n", i);
+      return false;
+    }
+  }
+  // Outputs 0 and 1 are the mutated k/v caches (numel cn); output 2 is attn.
+  for (int i = 0; i < 2; i++) {
+    if (outputs[i].toTensor().numel() != cn) {
+      printf(
+          "FAIL: output %d (expected k/v cache) numel %zu != Cmax*Hkv*D %d\n",
+          i,
+          (size_t)outputs[i].toTensor().numel(),
+          cn);
+      return false;
+    }
+  }
+  const auto& out_tensor = outputs[2].toTensor();
   if (out_tensor.numel() != on) {
     printf(
-        "FAIL: attention output numel %d != expected %d\n",
-        (int)out_tensor.numel(),
+        "FAIL: attention output numel %zu != S*Hq*D %d\n",
+        (size_t)out_tensor.numel(),
         on);
     return false;
   }

From 87ab70644938fb2e580d45f89e730abfc87b0847 Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <anthony@shoumikh.in>
Date: Mon, 15 Jun 2026 08:49:32 -0700
Subject: [PATCH 3/4] Make SDPA replay + dynamic-decode select attn output by
 position too

The replay and dynamic-input_pos decode tests had the same latent bug as the
sweep: they classified outputs by element count (ne==qn attn, ne==cn cache),
which is ambiguous when the attention output and caches share numel. Switch
both to the documented positional order [k_cache, v_cache, attn_output] and
assert 3 tensor outputs; the existing content-based k/v cache disambiguation
and cross-step threading are unchanged.
---
 backends/webgpu/test/test_webgpu_native.cpp | 74 +++++++++++++--------
 1 file changed, 46 insertions(+), 28 deletions(-)

diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index a42cc067f82..0c816ae1138 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -819,21 +819,30 @@ static bool test_sdpa_replay(const SdpaSequence& seq, const std::string& dir) {
 
     // The op returns [k_cache, v_cache, attn_output]: attn has a unique numel;
     // the two caches share numel cn, so identify them by content at step 0.
-    int attn_idx = -1;
-    std::vector<int> cache_idxs;
-    for (size_t i = 0; i < outs.size(); i++) {
-      if (!outs[i].isTensor()) {
-        continue;
-      }
-      const int ne = static_cast<int>(outs[i].toTensor().numel());
-      if (ne == qn) {
-        attn_idx = static_cast<int>(i);
-      } else if (ne == cn) {
-        cache_idxs.push_back(static_cast<int>(i));
-      }
+    // Outputs are [k_cache, v_cache, attn_output]: ExecuTorch emits
+    // [*mutated_inputs, *user_outputs], so the two mutated caches come first
+    // (signature order k, v) and the attention output last. Select by position;
+    // numel is ambiguous when the attn output and caches share numel. The k/v
+    // caches are still disambiguated by content at step 0 below.
+    if (outs.size() != 3 || !outs[0].isTensor() || !outs[1].isTensor() ||
+        !outs[2].isTensor()) {
+      printf(
+          "FAIL: %s step%zu: expected 3 tensor outputs "
+          "[k_cache, v_cache, attn_output], got %zu\n",
+          seq.name,
+          t,
+          outs.size());
+      return false;
     }
-    if (attn_idx < 0 || cache_idxs.size() != 2) {
-      printf("FAIL: %s step%zu: expected 1 attn + 2 caches\n", seq.name, t);
+    const int attn_idx = 2;
+    const std::vector<int> cache_idxs = {0, 1};
+    if (static_cast<int>(outs[attn_idx].toTensor().numel()) != qn) {
+      printf(
+          "FAIL: %s step%zu: attn output numel %zu != expected %d\n",
+          seq.name,
+          t,
+          (size_t)outs[attn_idx].toTensor().numel(),
+          qn);
       return false;
     }
 
@@ -988,21 +997,30 @@ static bool test_sdpa_dynamic_decode(
     }
     const auto& outs = result.get();
 
-    int attn_idx = -1;
-    std::vector<int> cache_idxs;
-    for (size_t i = 0; i < outs.size(); i++) {
-      if (!outs[i].isTensor()) {
-        continue;
-      }
-      const int ne = static_cast<int>(outs[i].toTensor().numel());
-      if (ne == qn) {
-        attn_idx = static_cast<int>(i);
-      } else if (ne == cn) {
-        cache_idxs.push_back(static_cast<int>(i));
-      }
+    // Outputs are [k_cache, v_cache, attn_output]: ExecuTorch emits
+    // [*mutated_inputs, *user_outputs], so the two mutated caches come first
+    // (signature order k, v) and the attention output last. Select by position;
+    // numel is ambiguous when the attn output and caches share numel. The k/v
+    // caches are still disambiguated by content at step 0 below.
+    if (outs.size() != 3 || !outs[0].isTensor() || !outs[1].isTensor() ||
+        !outs[2].isTensor()) {
+      printf(
+          "FAIL: %s step%d: expected 3 tensor outputs "
+          "[k_cache, v_cache, attn_output], got %zu\n",
+          seq.name,
+          t,
+          outs.size());
+      return false;
     }
-    if (attn_idx < 0 || cache_idxs.size() != 2) {
-      printf("FAIL: %s step%d: expected 1 attn + 2 caches\n", seq.name, t);
+    const int attn_idx = 2;
+    const std::vector<int> cache_idxs = {0, 1};
+    if (static_cast<int>(outs[attn_idx].toTensor().numel()) != qn) {
+      printf(
+          "FAIL: %s step%d: attn output numel %zu != expected %d\n",
+          seq.name,
+          t,
+          (size_t)outs[attn_idx].toTensor().numel(),
+          qn);
       return false;
     }
     if (t == 0) {

From f7c3da722d1bfe7c601f8e1996ca764c76121cca Mon Sep 17 00:00:00 2001
From: Anthony Shoumikhin <anthony@shoumikh.in>
Date: Mon, 15 Jun 2026 08:55:26 -0700
Subject: [PATCH 4/4] Address review: drop stale replay comment + assert KV
 cache numels

- Remove the leftover replay comment claiming the attn output has a unique
  numel (the whole point of this PR is that it can equal the cache numel).
- In both replay and dynamic-decode, after positional selection, assert the
  two cache outputs have numel cn before step-0 identification/threading read
  cn/kvn elements from them, turning a short/changed tensor into a clean
  failure instead of an out-of-bounds read.
---
 backends/webgpu/test/test_webgpu_native.cpp | 32 +++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index 0c816ae1138..fea5b591a7e 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -817,8 +817,6 @@ static bool test_sdpa_replay(const SdpaSequence& seq, const std::string& dir) {
     }
     const auto& outs = result.get();
 
-    // The op returns [k_cache, v_cache, attn_output]: attn has a unique numel;
-    // the two caches share numel cn, so identify them by content at step 0.
     // Outputs are [k_cache, v_cache, attn_output]: ExecuTorch emits
     // [*mutated_inputs, *user_outputs], so the two mutated caches come first
     // (signature order k, v) and the attention output last. Select by position;
@@ -845,6 +843,21 @@ static bool test_sdpa_replay(const SdpaSequence& seq, const std::string& dir) {
           qn);
       return false;
     }
+    // Caches must be full-size (numel cn): step-0 identification and cross-step
+    // threading read cn/kvn elements from them, so a short tensor would be an
+    // out-of-bounds read rather than a clean failure.
+    for (int ci : cache_idxs) {
+      if (static_cast<int>(outs[ci].toTensor().numel()) != cn) {
+        printf(
+            "FAIL: %s step%zu: cache output %d numel %zu != Cmax*Hkv*D %d\n",
+            seq.name,
+            t,
+            ci,
+            (size_t)outs[ci].toTensor().numel(),
+            cn);
+        return false;
+      }
+    }
 
     if (t == 0) {
       const float* c0 = outs[cache_idxs[0]].toTensor().const_data_ptr<float>();
@@ -1023,6 +1036,21 @@ static bool test_sdpa_dynamic_decode(
           qn);
       return false;
     }
+    // Caches must be full-size (numel cn): step-0 identification and cross-step
+    // threading read cn/kvn elements from them, so a short tensor would be an
+    // out-of-bounds read rather than a clean failure.
+    for (int ci : cache_idxs) {
+      if (static_cast<int>(outs[ci].toTensor().numel()) != cn) {
+        printf(
+            "FAIL: %s step%d: cache output %d numel %zu != Cmax*Hkv*D %d\n",
+            seq.name,
+            t,
+            ci,
+            (size_t)outs[ci].toTensor().numel(),
+            cn);
+        return false;
+      }
+    }
     if (t == 0) {
       const float* c0 = outs[cache_idxs[0]].toTensor().const_data_ptr<float>();
       const float* c1 = outs[cache_idxs[1]].toTensor().const_data_ptr<float>();