From af484ca71cc5c7fe65a670758ad6455d70dc5564 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 15 Jun 2026 07:54:15 -0700 Subject: [PATCH 1/4] Fix webgpu native SDPA test: select attention output by position, not numel The sdpa_with_kv_cache sweep picked the attention output among the mutating op's outputs by matching element count. For llama1b_prefill (Hq=32,Hkv=8,D=64,S=128,Cmax=512) the attention output (S*Hq*D=262144) shares its numel with both the k and v caches (Cmax*Hkv*D=262144, since S*Hq==Cmax*Hkv), so three tensors matched and the test failed with 'ambiguous attention output'. The op returns [k_cache, v_cache, attn_output] in a fixed order, so select the last output instead and keep a numel sanity check. Test-only; the kernel is correct. --- backends/webgpu/test/test_webgpu_native.cpp | 29 +++++++-------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index 338ecb39913..78ee955dbd9 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -662,32 +662,23 @@ static bool test_sdpa_config( } const auto& outputs = result.get(); - // The mutating op returns [k_cache, v_cache, attn_output]; select the - // attention output (numel == S*Hq*D), not a mutated cache (numel Cmax*Hkv*D). - // Count matches and fail if ambiguous: a cache could share the same numel. - int attn_idx = -1; - int attn_matches = 0; - for (size_t i = 0; i < outputs.size(); i++) { - if (outputs[i].isTensor() && outputs[i].toTensor().numel() == on) { - attn_idx = static_cast(i); - attn_matches++; - } - } - if (attn_idx < 0) { + // The mutating op returns [k_cache, v_cache, attn_output] in a fixed order, + // so the attention output is always the last output. Selecting it by element + // count is unsafe: a mutated cache (numel Cmax*Hkv*D) can share the attention + // output's numel (S*Hq*D) whenever S*Hq == Cmax*Hkv (e.g. llama1b_prefill). + if (outputs.empty() || !outputs.back().isTensor()) { printf( - "FAIL: no attention output (numel %d) among %zu outputs\n", - on, - outputs.size()); + "FAIL: missing attention output among %zu outputs\n", outputs.size()); return false; } - if (attn_matches > 1) { + const auto& out_tensor = outputs.back().toTensor(); + if (out_tensor.numel() != on) { printf( - "FAIL: ambiguous attention output: %d tensors match numel %d\n", - attn_matches, + "FAIL: attention output numel %d != expected %d\n", + (int)out_tensor.numel(), on); return false; } - const auto& out_tensor = outputs[attn_idx].toTensor(); const float* out_data = out_tensor.const_data_ptr(); std::vector golden = load_golden(golden_path, on); From 71200facc1112995f83b05e689aec0cab47e9d93 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 15 Jun 2026 08:06:17 -0700 Subject: [PATCH 2/4] Address review: select attn output by position with count + per-slot numel asserts Use the documented ExecuTorch output order [*mutated_inputs, *user_outputs] = [k_cache, v_cache, attn_output] and select the attention output by position (index 2) instead of by element count. Assert outputs.size()==3, that each output is a tensor, and that the two cache slots have numel Cmax*Hkv*D, so a future change in output arity/order fails loudly. Print numel as %zu/size_t to match the rest of the file and avoid truncation. Addresses Copilot review comments. --- backends/webgpu/test/test_webgpu_native.cpp | 41 ++++++++++++++++----- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index 78ee955dbd9..a42cc067f82 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -662,20 +662,43 @@ static bool test_sdpa_config( } const auto& outputs = result.get(); - // The mutating op returns [k_cache, v_cache, attn_output] in a fixed order, - // so the attention output is always the last output. Selecting it by element - // count is unsafe: a mutated cache (numel Cmax*Hkv*D) can share the attention - // output's numel (S*Hq*D) whenever S*Hq == Cmax*Hkv (e.g. llama1b_prefill). - if (outputs.empty() || !outputs.back().isTensor()) { + // sdpa_with_kv_cache mutates k_cache/v_cache in place and returns the + // attention output. ExecuTorch emits program outputs as + // [*mutated_inputs, *user_outputs], so forward() returns exactly + // [k_cache, v_cache, attn_output]: three tensors, attention output last. + // Selecting by element count is unsafe: when S*Hq*D == Cmax*Hkv*D + // (e.g. llama1b_prefill, all 262144) the attention output and both caches + // share numel. Select by the documented position instead, and assert the + // output count and per-slot numels so a future change in output structure + // still fails loudly. + if (outputs.size() != 3) { printf( - "FAIL: missing attention output among %zu outputs\n", outputs.size()); + "FAIL: expected 3 outputs [k_cache, v_cache, attn_output], got %zu\n", + outputs.size()); return false; } - const auto& out_tensor = outputs.back().toTensor(); + for (size_t i = 0; i < outputs.size(); i++) { + if (!outputs[i].isTensor()) { + printf("FAIL: output %zu is not a tensor\n", i); + return false; + } + } + // Outputs 0 and 1 are the mutated k/v caches (numel cn); output 2 is attn. + for (int i = 0; i < 2; i++) { + if (outputs[i].toTensor().numel() != cn) { + printf( + "FAIL: output %d (expected k/v cache) numel %zu != Cmax*Hkv*D %d\n", + i, + (size_t)outputs[i].toTensor().numel(), + cn); + return false; + } + } + const auto& out_tensor = outputs[2].toTensor(); if (out_tensor.numel() != on) { printf( - "FAIL: attention output numel %d != expected %d\n", - (int)out_tensor.numel(), + "FAIL: attention output numel %zu != S*Hq*D %d\n", + (size_t)out_tensor.numel(), on); return false; } From 87ab70644938fb2e580d45f89e730abfc87b0847 Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 15 Jun 2026 08:49:32 -0700 Subject: [PATCH 3/4] Make SDPA replay + dynamic-decode select attn output by position too The replay and dynamic-input_pos decode tests had the same latent bug as the sweep: they classified outputs by element count (ne==qn attn, ne==cn cache), which is ambiguous when the attention output and caches share numel. Switch both to the documented positional order [k_cache, v_cache, attn_output] and assert 3 tensor outputs; the existing content-based k/v cache disambiguation and cross-step threading are unchanged. --- backends/webgpu/test/test_webgpu_native.cpp | 74 +++++++++++++-------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index a42cc067f82..0c816ae1138 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -819,21 +819,30 @@ static bool test_sdpa_replay(const SdpaSequence& seq, const std::string& dir) { // The op returns [k_cache, v_cache, attn_output]: attn has a unique numel; // the two caches share numel cn, so identify them by content at step 0. - int attn_idx = -1; - std::vector cache_idxs; - for (size_t i = 0; i < outs.size(); i++) { - if (!outs[i].isTensor()) { - continue; - } - const int ne = static_cast(outs[i].toTensor().numel()); - if (ne == qn) { - attn_idx = static_cast(i); - } else if (ne == cn) { - cache_idxs.push_back(static_cast(i)); - } + // Outputs are [k_cache, v_cache, attn_output]: ExecuTorch emits + // [*mutated_inputs, *user_outputs], so the two mutated caches come first + // (signature order k, v) and the attention output last. Select by position; + // numel is ambiguous when the attn output and caches share numel. The k/v + // caches are still disambiguated by content at step 0 below. + if (outs.size() != 3 || !outs[0].isTensor() || !outs[1].isTensor() || + !outs[2].isTensor()) { + printf( + "FAIL: %s step%zu: expected 3 tensor outputs " + "[k_cache, v_cache, attn_output], got %zu\n", + seq.name, + t, + outs.size()); + return false; } - if (attn_idx < 0 || cache_idxs.size() != 2) { - printf("FAIL: %s step%zu: expected 1 attn + 2 caches\n", seq.name, t); + const int attn_idx = 2; + const std::vector cache_idxs = {0, 1}; + if (static_cast(outs[attn_idx].toTensor().numel()) != qn) { + printf( + "FAIL: %s step%zu: attn output numel %zu != expected %d\n", + seq.name, + t, + (size_t)outs[attn_idx].toTensor().numel(), + qn); return false; } @@ -988,21 +997,30 @@ static bool test_sdpa_dynamic_decode( } const auto& outs = result.get(); - int attn_idx = -1; - std::vector cache_idxs; - for (size_t i = 0; i < outs.size(); i++) { - if (!outs[i].isTensor()) { - continue; - } - const int ne = static_cast(outs[i].toTensor().numel()); - if (ne == qn) { - attn_idx = static_cast(i); - } else if (ne == cn) { - cache_idxs.push_back(static_cast(i)); - } + // Outputs are [k_cache, v_cache, attn_output]: ExecuTorch emits + // [*mutated_inputs, *user_outputs], so the two mutated caches come first + // (signature order k, v) and the attention output last. Select by position; + // numel is ambiguous when the attn output and caches share numel. The k/v + // caches are still disambiguated by content at step 0 below. + if (outs.size() != 3 || !outs[0].isTensor() || !outs[1].isTensor() || + !outs[2].isTensor()) { + printf( + "FAIL: %s step%d: expected 3 tensor outputs " + "[k_cache, v_cache, attn_output], got %zu\n", + seq.name, + t, + outs.size()); + return false; } - if (attn_idx < 0 || cache_idxs.size() != 2) { - printf("FAIL: %s step%d: expected 1 attn + 2 caches\n", seq.name, t); + const int attn_idx = 2; + const std::vector cache_idxs = {0, 1}; + if (static_cast(outs[attn_idx].toTensor().numel()) != qn) { + printf( + "FAIL: %s step%d: attn output numel %zu != expected %d\n", + seq.name, + t, + (size_t)outs[attn_idx].toTensor().numel(), + qn); return false; } if (t == 0) { From f7c3da722d1bfe7c601f8e1996ca764c76121cca Mon Sep 17 00:00:00 2001 From: Anthony Shoumikhin Date: Mon, 15 Jun 2026 08:55:26 -0700 Subject: [PATCH 4/4] Address review: drop stale replay comment + assert KV cache numels - Remove the leftover replay comment claiming the attn output has a unique numel (the whole point of this PR is that it can equal the cache numel). - In both replay and dynamic-decode, after positional selection, assert the two cache outputs have numel cn before step-0 identification/threading read cn/kvn elements from them, turning a short/changed tensor into a clean failure instead of an out-of-bounds read. --- backends/webgpu/test/test_webgpu_native.cpp | 32 +++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index 0c816ae1138..fea5b591a7e 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -817,8 +817,6 @@ static bool test_sdpa_replay(const SdpaSequence& seq, const std::string& dir) { } const auto& outs = result.get(); - // The op returns [k_cache, v_cache, attn_output]: attn has a unique numel; - // the two caches share numel cn, so identify them by content at step 0. // Outputs are [k_cache, v_cache, attn_output]: ExecuTorch emits // [*mutated_inputs, *user_outputs], so the two mutated caches come first // (signature order k, v) and the attention output last. Select by position; @@ -845,6 +843,21 @@ static bool test_sdpa_replay(const SdpaSequence& seq, const std::string& dir) { qn); return false; } + // Caches must be full-size (numel cn): step-0 identification and cross-step + // threading read cn/kvn elements from them, so a short tensor would be an + // out-of-bounds read rather than a clean failure. + for (int ci : cache_idxs) { + if (static_cast(outs[ci].toTensor().numel()) != cn) { + printf( + "FAIL: %s step%zu: cache output %d numel %zu != Cmax*Hkv*D %d\n", + seq.name, + t, + ci, + (size_t)outs[ci].toTensor().numel(), + cn); + return false; + } + } if (t == 0) { const float* c0 = outs[cache_idxs[0]].toTensor().const_data_ptr(); @@ -1023,6 +1036,21 @@ static bool test_sdpa_dynamic_decode( qn); return false; } + // Caches must be full-size (numel cn): step-0 identification and cross-step + // threading read cn/kvn elements from them, so a short tensor would be an + // out-of-bounds read rather than a clean failure. + for (int ci : cache_idxs) { + if (static_cast(outs[ci].toTensor().numel()) != cn) { + printf( + "FAIL: %s step%d: cache output %d numel %zu != Cmax*Hkv*D %d\n", + seq.name, + t, + ci, + (size_t)outs[ci].toTensor().numel(), + cn); + return false; + } + } if (t == 0) { const float* c0 = outs[cache_idxs[0]].toTensor().const_data_ptr(); const float* c1 = outs[cache_idxs[1]].toTensor().const_data_ptr();