From 772fde70e73a152bcac4c0de1ab60dceaa4a0bc0 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Wed, 10 Jun 2026 13:19:20 -0700
Subject: [PATCH 1/3] Update

[ghstack-poisoned]
---
 backends/webgpu/CMakeLists.txt              |   1 +
 backends/webgpu/runtime/WebGPUDevice.cpp    |  11 +
 backends/webgpu/runtime/WebGPUDevice.h      |   8 +
 backends/webgpu/runtime/WebGPUGraph.cpp     |  56 ++++-
 backends/webgpu/runtime/WebGPUGraph.h       |   1 +
 backends/webgpu/runtime/WebGPUQueryPool.cpp | 220 ++++++++++++++++++++
 backends/webgpu/runtime/WebGPUQueryPool.h   |  84 ++++++++
 7 files changed, 380 insertions(+), 1 deletion(-)
 create mode 100644 backends/webgpu/runtime/WebGPUQueryPool.cpp
 create mode 100644 backends/webgpu/runtime/WebGPUQueryPool.h
diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 9b1476f2290..9847763dba5 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -30,6 +30,7 @@ set(WEBGPU_SRCS
     runtime/WebGPUGraph.cpp
     runtime/WebGPUDelegateHeader.cpp
     runtime/WebGPUDevice.cpp
+    runtime/WebGPUQueryPool.cpp
     runtime/ops/OperatorRegistry.cpp
     runtime/ops/add/BinaryOp.cpp
     runtime/ops/rms_norm/RmsNorm.cpp
diff --git a/backends/webgpu/runtime/WebGPUDevice.cpp b/backends/webgpu/runtime/WebGPUDevice.cpp
index 041cbe5a703..6672459320d 100644
--- a/backends/webgpu/runtime/WebGPUDevice.cpp
+++ b/backends/webgpu/runtime/WebGPUDevice.cpp
@@ -13,6 +13,7 @@
 #include <cstdlib>
 #include <memory>
 #include <stdexcept>
+#include <vector>
 
 namespace executorch {
 namespace backends {
@@ -137,6 +138,16 @@ WebGPUContext create_webgpu_context() {
       WGPUStatus_Success) {
     device_desc.requiredLimits = &supported_limits;
   }
+
+  // Bench: enable TimestampQuery if available; fail-open (skip timing if not).
+  std::vector<WGPUFeatureName> required_features;
+  if (wgpuAdapterHasFeature(ctx.adapter, WGPUFeatureName_TimestampQuery)) {
+    required_features.push_back(WGPUFeatureName_TimestampQuery);
+    device_desc.requiredFeatureCount = required_features.size();
+    device_desc.requiredFeatures = required_features.data();
+    ctx.timestamp_supported = true;
+  }
+
   device_desc.uncapturedErrorCallbackInfo.callback = on_device_error;
 
   WGPUWaitStatus device_wait = webgpu_wait(
diff --git a/backends/webgpu/runtime/WebGPUDevice.h b/backends/webgpu/runtime/WebGPUDevice.h
index 78afd96316a..90100fa831a 100644
--- a/backends/webgpu/runtime/WebGPUDevice.h
+++ b/backends/webgpu/runtime/WebGPUDevice.h
@@ -10,6 +10,10 @@
 
 #include <webgpu/webgpu.h>
 
+#include <executorch/backends/webgpu/runtime/WebGPUQueryPool.h>
+
+#include <memory>
+
 namespace executorch {
 namespace backends {
 namespace webgpu {
@@ -19,6 +23,10 @@ struct WebGPUContext {
   WGPUAdapter adapter = nullptr;
   WGPUDevice device = nullptr;
   WGPUQueue queue = nullptr;
+  // True if the device was created with the TimestampQuery feature (bench).
+  bool timestamp_supported = false;
+  // Bench-only: timestamp-query pool, lazily created in execute() (env-gated).
+  std::unique_ptr<WebGPUQueryPool> querypool;
 };
 
 WebGPUContext create_webgpu_context();
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index b3ae5511d13..410df4d89dc 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -15,6 +15,7 @@
 #include <executorch/backends/webgpu/runtime/WebGPUCompat.h>
 #include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
 
+#include <cstdlib>
 #include <cstring>
 #include <stdexcept>
 
@@ -496,18 +497,48 @@ void WebGPUGraph::copy_inputs(
   }
 }
 
+namespace {
+// Bench gate: WEBGPU_TIMESTAMP_QUERY enables per-pass GPU timestamp queries.
+bool should_timestamp_query() {
+  static const bool enabled = std::getenv("WEBGPU_TIMESTAMP_QUERY") != nullptr;
+  return enabled;
+}
+} // namespace
+
 void WebGPUGraph::execute() {
   const size_t n = dispatches_.size();
   const size_t chunk = execute_config_.chunk_size;
 
   if (chunk == 0 || n <= chunk) {
+    // Bench: timestamp-query pool, null unless env-gated + feature present.
+    WebGPUQueryPool* qp = nullptr;
+    if (should_timestamp_query() && n > 0) {
+      if (auto* ctx = get_default_webgpu_context()) {
+        if (ctx->timestamp_supported) {
+          if (!ctx->querypool || ctx->querypool->capacity() < n) {
+            ctx->querypool = std::make_unique<WebGPUQueryPool>();
+            ctx->querypool->initialize(device_, static_cast<uint32_t>(n));
+          }
+          qp = ctx->querypool.get();
+          qp->reset(static_cast<uint32_t>(n));
+        }
+      }
+    }
+
     WGPUCommandEncoderDescriptor enc_desc = {};
     WGPUCommandEncoder encoder =
         wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
 
     // One pass per dispatch: enforces storage RAW ordering across deps.
-    for (const auto& dispatch : dispatches_) {
+    for (size_t i = 0; i < n; i++) {
+      const auto& dispatch = dispatches_[i];
+      // tw must outlive BeginComputePass (the descriptor points at it).
+      WGPUPassTimestampWrites tw = {};
       WGPUComputePassDescriptor pass_desc = {};
+      if (qp) {
+        tw = qp->writes_for(static_cast<uint32_t>(i));
+        pass_desc.timestampWrites = &tw;
+      }
       WGPUComputePassEncoder pass =
           wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
       wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline);
@@ -517,6 +548,13 @@ void WebGPUGraph::execute() {
           pass, dispatch.workgroup_count_x, 1, 1);
       wgpuComputePassEncoderEnd(pass);
       wgpuComputePassEncoderRelease(pass);
+      if (qp) {
+        qp->record(
+            static_cast<uint32_t>(i),
+            dispatch.kernel_name,
+            {dispatch.workgroup_count_x, 1, 1},
+            {1, 1, 1});
+      }
     }
 
     for (const auto& copy : output_copies_) {
@@ -524,15 +562,31 @@ void WebGPUGraph::execute() {
           encoder, copy.src_buffer, 0, copy.staging_buffer, 0, copy.nbytes);
     }
 
+    if (qp) {
+      qp->resolve(encoder);
+    }
+
     WGPUCommandBufferDescriptor cmd_desc = {};
     WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
     wgpuQueueSubmit(queue_, 1, &cmd);
 
     wgpuCommandBufferRelease(cmd);
     wgpuCommandEncoderRelease(encoder);
+
+    if (qp) {
+      qp->extract_results(instance_);
+      qp->print_results();
+    }
     return;
   }
 
+  // GPU timestamp queries assume one submit; chunked execute is multi-submit.
+  if (should_timestamp_query()) {
+    throw std::runtime_error(
+        "WebGPU: WEBGPU_TIMESTAMP_QUERY is incompatible with chunked execute "
+        "(multi-submit); disable chunking to use GPU timestamp queries");
+  }
+
   const size_t first_chunk = execute_config_.initial_chunk_size > 0
       ? execute_config_.initial_chunk_size
       : chunk;
diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
index 9f656ce4d14..92aa14d59b6 100644
--- a/backends/webgpu/runtime/WebGPUGraph.h
+++ b/backends/webgpu/runtime/WebGPUGraph.h
@@ -31,6 +31,7 @@ struct WebGPUDispatch {
   WGPUComputePipeline pipeline = nullptr;
   WGPUBindGroup bind_group = nullptr;
   uint32_t workgroup_count_x = 1;
+  std::string kernel_name; // bench label
 };
 
 struct OutputCopy {
diff --git a/backends/webgpu/runtime/WebGPUQueryPool.cpp b/backends/webgpu/runtime/WebGPUQueryPool.cpp
new file mode 100644
index 00000000000..0e5c583337c
--- /dev/null
+++ b/backends/webgpu/runtime/WebGPUQueryPool.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUCompat.h>
+#include <executorch/backends/webgpu/runtime/WebGPUQueryPool.h>
+
+#include <cstdio>
+#include <map>
+#include <stdexcept>
+#include <string>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+struct MapCallbackData {
+  WGPUMapAsyncStatus status = WGPUMapAsyncStatus_Error;
+};
+
+void map_callback(
+    WGPUMapAsyncStatus status,
+    WGPUStringView /*message*/,
+    void* userdata1,
+    void* /*userdata2*/) {
+  // userdata1 is the MapCallbackData* that extract_results() registered.
+  static_cast<MapCallbackData*>(userdata1)->status = status;
+}
+
+constexpr uint64_t kTimestampBytes = sizeof(uint64_t);
+
+} // namespace
+
+WebGPUQueryPool::~WebGPUQueryPool() {
+  if (readback_buf_ != nullptr) { // handles stay null until initialize()
+    wgpuBufferRelease(readback_buf_);
+  }
+  if (resolve_buf_ != nullptr) {
+    wgpuBufferRelease(resolve_buf_);
+  }
+  if (qset_ != nullptr) {
+    wgpuQuerySetRelease(qset_);
+  }
+}
+
+void WebGPUQueryPool::initialize(WGPUDevice device, uint32_t max_pairs) {
+  // No-op for an empty request or when a QuerySet already exists; the latter
+  // mirrors Vulkan QueryPool and avoids leaking the prior QuerySet.
+  if (max_pairs == 0 || qset_ != nullptr) {
+    return;
+  }
+  // NOTE(review): the WebGPU spec caps a query set at 4096 queries, so
+  // max_pairs above 2048 presumably fails validation -- confirm for callers.
+  capacity_pairs_ = max_pairs;
+
+  WGPUQuerySetDescriptor set_desc = {};
+  set_desc.type = WGPUQueryType_Timestamp;
+  set_desc.count = 2 * max_pairs; // one begin + one end timestamp per pair
+  qset_ = wgpuDeviceCreateQuerySet(device, &set_desc);
+
+  const uint64_t buffer_bytes =
+      static_cast<uint64_t>(set_desc.count) * kTimestampBytes;
+  WGPUBufferDescriptor resolve_desc = {};
+  resolve_desc.size = buffer_bytes;
+  resolve_desc.usage = WGPUBufferUsage_QueryResolve | WGPUBufferUsage_CopySrc;
+  resolve_buf_ = wgpuDeviceCreateBuffer(device, &resolve_desc);
+
+  WGPUBufferDescriptor readback_desc = {};
+  readback_desc.size = buffer_bytes;
+  readback_desc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst;
+  readback_buf_ = wgpuDeviceCreateBuffer(device, &readback_desc);
+  // WebGPU timestamps are already nanoseconds, so ns_per_tick_ stays 1.0.
+}
+
+void WebGPUQueryPool::reset(uint32_t num_dispatches) {
+  // Reject oversized runs up front; mirrors Vulkan QueryPool VK_CHECK_COND.
+  if (capacity_pairs_ < num_dispatches) {
+    throw std::runtime_error(
+        "WebGPUQueryPool: num_dispatches " + std::to_string(num_dispatches) +
+        " exceeds capacity " + std::to_string(capacity_pairs_));
+  }
+  durations_.clear();
+  num_pairs_ = num_dispatches;
+}
+
+WGPUPassTimestampWrites WebGPUQueryPool::writes_for(uint32_t i) {
+  WGPUPassTimestampWrites writes = {};
+  writes.querySet = qset_;
+  writes.beginningOfPassWriteIndex = 2 * i; // even slot: pass start
+  writes.endOfPassWriteIndex = 2 * i + 1; // odd slot: pass end
+  return writes;
+}
+
+void WebGPUQueryPool::record(
+    uint32_t i,
+    const std::string& name,
+    std::array<uint32_t, 3> gwg,
+    std::array<uint32_t, 3> lwg) {
+  ShaderDuration entry; // timing fields stay 0 until extract_results()
+  entry.local_wg = lwg;
+  entry.global_wg = gwg;
+  entry.kernel_name = name;
+  entry.idx = i;
+  durations_.push_back(entry);
+}
+
+void WebGPUQueryPool::resolve(WGPUCommandEncoder encoder) {
+  if (num_pairs_ == 0) {
+    return; // reset() registered no dispatches for this run
+  }
+  const uint32_t query_count = 2 * num_pairs_;
+  const uint64_t copy_bytes =
+      static_cast<uint64_t>(query_count) * kTimestampBytes;
+  // Resolve the timestamps into resolve_buf_, then copy them into the
+  // MapRead-capable readback_buf_ that extract_results() maps afterwards.
+  wgpuCommandEncoderResolveQuerySet(
+      encoder, qset_, 0, query_count, resolve_buf_, 0);
+  wgpuCommandEncoderCopyBufferToBuffer(
+      encoder, resolve_buf_, 0, readback_buf_, 0, copy_bytes);
+}
+
+void WebGPUQueryPool::extract_results(WGPUInstance instance) {
+  if (num_pairs_ == 0) {
+    return; // reset() registered no dispatches for this run
+  }
+  const uint64_t bytes =
+      static_cast<uint64_t>(2 * num_pairs_) * kTimestampBytes;
+  // map_callback overwrites the Error default of outcome.status when it runs.
+  MapCallbackData outcome;
+  WGPUBufferMapCallbackInfo map_info = {};
+  map_info.mode = WGPUCallbackMode_WaitAnyOnly;
+  map_info.callback = map_callback;
+  map_info.userdata1 = &outcome;
+  webgpu_wait(
+      instance,
+      wgpuBufferMapAsync(readback_buf_, WGPUMapMode_Read, 0, bytes, map_info));
+  if (outcome.status != WGPUMapAsyncStatus_Success) {
+    printf(
+        "WebGPUQueryPool: readback map failed (status %d)\n",
+        static_cast<int>(outcome.status));
+    return;
+  }
+  const auto* ticks = static_cast<const uint64_t*>(
+      wgpuBufferGetConstMappedRange(readback_buf_, 0, bytes));
+  if (ticks != nullptr) {
+    for (ShaderDuration& entry : durations_) {
+      const uint64_t tb = ticks[2 * entry.idx]; // begin-of-pass tick
+      const uint64_t te = ticks[2 * entry.idx + 1]; // end-of-pass tick
+      entry.start_time_ns = static_cast<uint64_t>(tb * ns_per_tick_);
+      entry.end_time_ns = static_cast<uint64_t>(te * ns_per_tick_);
+      entry.execution_duration_ns =
+          (te >= tb) ? static_cast<uint64_t>((te - tb) * ns_per_tick_) : 0;
+    }
+  }
+  wgpuBufferUnmap(readback_buf_);
+}
+
+// Prints per-dispatch GPU durations (in us) to stdout; tsv selects TSV rows.
+void WebGPUQueryPool::print_results(bool tsv) const {
+  const char* sep = tsv ? "\t" : "  ";
+  if (tsv) {
+    printf("idx%skernel%sgwg%sduration_us\n", sep, sep, sep);
+  } else {
+    printf("=== WebGPUQueryPool: per-dispatch GPU time ===\n");
+  }
+  for (const auto& d : durations_) {
+    printf(
+        "%u%s%s%s(%u,%u,%u)%s%.3f\n",
+        d.idx,
+        sep,
+        d.kernel_name.empty() ? "dispatch" : d.kernel_name.c_str(),
+        sep,
+        d.global_wg[0],
+        d.global_wg[1],
+        d.global_wg[2],
+        sep,
+        d.execution_duration_ns / 1000.0);
+  }
+  if (tsv) {
+    return; // the per-kernel summary is only printed in the non-TSV form
+  }
+  std::map<std::string, std::pair<uint64_t, uint32_t>> totals;
+  for (const auto& d : durations_) {
+    auto& t = totals[d.kernel_name.empty() ? "dispatch" : d.kernel_name];
+    t.first += d.execution_duration_ns;
+    t.second += 1;
+  }
+  printf("--- per-kernel mean / total (us) ---\n");
+  for (const auto& [label, acc] : totals) {
+    // Divide in floating point: integer division would floor the mean to 1 ns.
+    const double total_us = acc.first / 1000.0;
+    printf(
+        "%s%smean %.3f%stotal %.3f (n=%u)\n",
+        label.c_str(),
+        sep,
+        total_us / acc.second,
+        sep,
+        total_us,
+        acc.second);
+  }
+}
+
+uint64_t WebGPUQueryPool::get_mean_shader_ns(
+    const std::string& kernel_name) const {
+  uint64_t sum = 0; // total ns across dispatches labelled kernel_name
+  uint32_t n = 0; // how many dispatches carried that label
+  for (const ShaderDuration& entry : durations_) {
+    if (entry.kernel_name == kernel_name) {
+      sum += entry.execution_duration_ns;
+      ++n;
+    }
+  }
+  return n == 0 ? 0 : sum / n;
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/WebGPUQueryPool.h b/backends/webgpu/runtime/WebGPUQueryPool.h
new file mode 100644
index 00000000000..ca0dd67e0a4
--- /dev/null
+++ b/backends/webgpu/runtime/WebGPUQueryPool.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <webgpu/webgpu.h>
+
+#include <array>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace executorch::backends::webgpu {
+
+// Per-dispatch GPU timing; mirrors Vulkan QueryPool ShaderDuration.
+struct ShaderDuration {
+  uint32_t idx = 0;
+  std::string kernel_name;
+  std::array<uint32_t, 3> global_wg{};
+  std::array<uint32_t, 3> local_wg{};
+  uint64_t start_time_ns = 0;
+  uint64_t end_time_ns = 0;
+  uint64_t execution_duration_ns = 0;
+};
+
+// GPU timestamp-query pool; re-port of Vulkan vk_api/QueryPool.
+class WebGPUQueryPool {
+ public:
+  WebGPUQueryPool() = default;
+  ~WebGPUQueryPool();
+
+  WebGPUQueryPool(const WebGPUQueryPool&) = delete;
+  WebGPUQueryPool& operator=(const WebGPUQueryPool&) = delete;
+
+  // Create the QuerySet + resolve/readback buffers; no-op if already created.
+  void initialize(WGPUDevice device, uint32_t max_pairs);
+  bool is_initialized() const {
+    return qset_ != nullptr;
+  }
+  uint32_t capacity() const {
+    return capacity_pairs_;
+  }
+
+  // Clear durations and set this run's dispatch count; throws if > capacity().
+  void reset(uint32_t num_dispatches);
+
+  // timestampWrites for pass i: begin=2i, end=2i+1.
+  WGPUPassTimestampWrites writes_for(uint32_t i);
+
+  // Record pass i's label + workgroup sizes (start/end filled by extract).
+  void record(
+      uint32_t i,
+      const std::string& name,
+      std::array<uint32_t, 3> gwg,
+      std::array<uint32_t, 3> lwg);
+
+  // Resolve the QuerySet into the readback buffer; call before submit.
+  void resolve(WGPUCommandEncoder encoder);
+
+  // Map the readback, convert ticks->ns, fill durations; call after submit.
+  void extract_results(WGPUInstance instance);
+
+  const std::vector<ShaderDuration>& results() const {
+    return durations_;
+  }
+  void print_results(bool tsv = false) const;
+  uint64_t get_mean_shader_ns(const std::string& kernel_name) const;
+
+ private:
+  WGPUQuerySet qset_ = nullptr;
+  WGPUBuffer resolve_buf_ = nullptr; // QueryResolve | CopySrc
+  WGPUBuffer readback_buf_ = nullptr; // MapRead | CopyDst
+  uint32_t capacity_pairs_ = 0;
+  uint32_t num_pairs_ = 0;
+  double ns_per_tick_ = 1.0; // WebGPU timestamps are already nanoseconds
+  std::vector<ShaderDuration> durations_;
+};
+
+} // namespace executorch::backends::webgpu

From 66e8fa48ead991d51822c7e5c59576c79f262e22 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Fri, 12 Jun 2026 13:31:34 -0700
Subject: [PATCH 2/3] Update

[ghstack-poisoned]
---
 backends/webgpu/CMakeLists.txt              | 14 ++++++++++++++
 backends/webgpu/runtime/WebGPUDevice.cpp    |  8 ++++++++
 backends/webgpu/runtime/WebGPUDevice.h      |  4 ++++
 backends/webgpu/runtime/WebGPUGraph.cpp     | 19 +++++++++++++++++--
 backends/webgpu/runtime/WebGPUQueryPool.cpp |  4 ++++
 backends/webgpu/runtime/WebGPUQueryPool.h   |  4 ++++
 backends/webgpu/test/test_webgpu_native.cpp |  7 ++++++-
 7 files changed, 57 insertions(+), 3 deletions(-)

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index 9847763dba5..da00858ab92 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -77,6 +77,15 @@ endif()
 
 target_compile_options(webgpu_backend PRIVATE -fexceptions)
 
+# Opt-in GPU timestamp profiling (WebGPUQueryPool); OFF so production builds
+# request no TimestampQuery device feature. Mirrors Vulkan's compile-flag gate.
+option(EXECUTORCH_BUILD_WEBGPU_PROFILING
+       "Enable WebGPU GPU timestamp-query profiling" OFF
+)
+if(EXECUTORCH_BUILD_WEBGPU_PROFILING)
+  target_compile_definitions(webgpu_backend PRIVATE WGPU_BACKEND_ENABLE_PROFILING)
+endif()
+
 # Link with --whole-archive for static registration of backend + ops
 executorch_target_link_options_shared_lib(webgpu_backend)
 
@@ -115,6 +124,11 @@ function(add_webgpu_native_test test_name test_src)
     target_link_libraries(${test_name} PRIVATE dl m pthread)
   endif()
   target_compile_options(${test_name} PRIVATE -fexceptions)
+  if(EXECUTORCH_BUILD_WEBGPU_PROFILING)
+    target_compile_definitions(
+      ${test_name} PRIVATE WGPU_BACKEND_ENABLE_PROFILING
+    )
+  endif()
   set_property(TARGET ${test_name} PROPERTY CXX_STANDARD 17)
 endfunction()
 
diff --git a/backends/webgpu/runtime/WebGPUDevice.cpp b/backends/webgpu/runtime/WebGPUDevice.cpp
index 6672459320d..e69101851a2 100644
--- a/backends/webgpu/runtime/WebGPUDevice.cpp
+++ b/backends/webgpu/runtime/WebGPUDevice.cpp
@@ -13,7 +13,9 @@
 #include <cstdlib>
 #include <memory>
 #include <stdexcept>
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
 #include <vector>
+#endif // WGPU_BACKEND_ENABLE_PROFILING
 
 namespace executorch {
 namespace backends {
@@ -139,6 +141,7 @@ WebGPUContext create_webgpu_context() {
     device_desc.requiredLimits = &supported_limits;
   }
 
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
   // Bench: enable TimestampQuery if available; fail-open (skip timing if not).
   std::vector<WGPUFeatureName> required_features;
   if (wgpuAdapterHasFeature(ctx.adapter, WGPUFeatureName_TimestampQuery)) {
@@ -147,6 +150,7 @@ WebGPUContext create_webgpu_context() {
     device_desc.requiredFeatures = required_features.data();
     ctx.timestamp_supported = true;
   }
+#endif // WGPU_BACKEND_ENABLE_PROFILING
 
   device_desc.uncapturedErrorCallbackInfo.callback = on_device_error;
 
@@ -203,6 +207,10 @@ WebGPUContext* get_default_webgpu_context() {
 }
 
 void destroy_webgpu_context(WebGPUContext& ctx) {
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
+  // Release device-child GPU resources before the device handle.
+  ctx.querypool.reset();
+#endif // WGPU_BACKEND_ENABLE_PROFILING
   if (ctx.queue) {
     wgpuQueueRelease(ctx.queue);
     ctx.queue = nullptr;
diff --git a/backends/webgpu/runtime/WebGPUDevice.h b/backends/webgpu/runtime/WebGPUDevice.h
index 90100fa831a..a332edef443 100644
--- a/backends/webgpu/runtime/WebGPUDevice.h
+++ b/backends/webgpu/runtime/WebGPUDevice.h
@@ -10,9 +10,11 @@
 
 #include <webgpu/webgpu.h>
 
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
 #include <executorch/backends/webgpu/runtime/WebGPUQueryPool.h>
 
 #include <memory>
+#endif // WGPU_BACKEND_ENABLE_PROFILING
 
 namespace executorch {
 namespace backends {
@@ -23,10 +25,12 @@ struct WebGPUContext {
   WGPUAdapter adapter = nullptr;
   WGPUDevice device = nullptr;
   WGPUQueue queue = nullptr;
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
   // True if the device was created with the TimestampQuery feature (bench).
   bool timestamp_supported = false;
   // Bench-only: timestamp-query pool, lazily created in execute() (env-gated).
   std::unique_ptr<WebGPUQueryPool> querypool;
+#endif // WGPU_BACKEND_ENABLE_PROFILING
 };
 
 WebGPUContext create_webgpu_context();
diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
index 410df4d89dc..1c977d130dd 100644
--- a/backends/webgpu/runtime/WebGPUGraph.cpp
+++ b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -498,10 +498,15 @@ void WebGPUGraph::copy_inputs(
 }
 
 namespace {
-// Bench gate: WEBGPU_TIMESTAMP_QUERY enables per-pass GPU timestamp queries.
+// Bench gate: compiled out unless WGPU_BACKEND_ENABLE_PROFILING; then the
+// WEBGPU_TIMESTAMP_QUERY env var enables per-pass GPU timestamp queries.
 bool should_timestamp_query() {
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
   static const bool enabled = std::getenv("WEBGPU_TIMESTAMP_QUERY") != nullptr;
   return enabled;
+#else
+  return false;
+#endif
 }
 } // namespace
 
@@ -510,6 +515,7 @@ void WebGPUGraph::execute() {
   const size_t chunk = execute_config_.chunk_size;
 
   if (chunk == 0 || n <= chunk) {
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
     // Bench: timestamp-query pool, null unless env-gated + feature present.
     WebGPUQueryPool* qp = nullptr;
     if (should_timestamp_query() && n > 0) {
@@ -524,6 +530,7 @@ void WebGPUGraph::execute() {
         }
       }
     }
+#endif // WGPU_BACKEND_ENABLE_PROFILING
 
     WGPUCommandEncoderDescriptor enc_desc = {};
     WGPUCommandEncoder encoder =
@@ -532,13 +539,15 @@ void WebGPUGraph::execute() {
     // One pass per dispatch: enforces storage RAW ordering across deps.
     for (size_t i = 0; i < n; i++) {
       const auto& dispatch = dispatches_[i];
+      WGPUComputePassDescriptor pass_desc = {};
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
       // tw must outlive BeginComputePass (the descriptor points at it).
       WGPUPassTimestampWrites tw = {};
-      WGPUComputePassDescriptor pass_desc = {};
       if (qp) {
         tw = qp->writes_for(static_cast<uint32_t>(i));
         pass_desc.timestampWrites = &tw;
       }
+#endif // WGPU_BACKEND_ENABLE_PROFILING
       WGPUComputePassEncoder pass =
           wgpuCommandEncoderBeginComputePass(encoder, &pass_desc);
       wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline);
@@ -548,6 +557,7 @@ void WebGPUGraph::execute() {
           pass, dispatch.workgroup_count_x, 1, 1);
       wgpuComputePassEncoderEnd(pass);
       wgpuComputePassEncoderRelease(pass);
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
       if (qp) {
         qp->record(
             static_cast<uint32_t>(i),
@@ -555,6 +565,7 @@ void WebGPUGraph::execute() {
             {dispatch.workgroup_count_x, 1, 1},
             {1, 1, 1});
       }
+#endif // WGPU_BACKEND_ENABLE_PROFILING
     }
 
     for (const auto& copy : output_copies_) {
@@ -562,9 +573,11 @@ void WebGPUGraph::execute() {
           encoder, copy.src_buffer, 0, copy.staging_buffer, 0, copy.nbytes);
     }
 
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
     if (qp) {
       qp->resolve(encoder);
     }
+#endif // WGPU_BACKEND_ENABLE_PROFILING
 
     WGPUCommandBufferDescriptor cmd_desc = {};
     WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
@@ -573,10 +586,12 @@ void WebGPUGraph::execute() {
     wgpuCommandBufferRelease(cmd);
     wgpuCommandEncoderRelease(encoder);
 
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
     if (qp) {
       qp->extract_results(instance_);
       qp->print_results();
     }
+#endif // WGPU_BACKEND_ENABLE_PROFILING
     return;
   }
 
diff --git a/backends/webgpu/runtime/WebGPUQueryPool.cpp b/backends/webgpu/runtime/WebGPUQueryPool.cpp
index 0e5c583337c..89e08a2afce 100644
--- a/backends/webgpu/runtime/WebGPUQueryPool.cpp
+++ b/backends/webgpu/runtime/WebGPUQueryPool.cpp
@@ -16,6 +16,8 @@
 
 namespace executorch::backends::webgpu {
 
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
+
 namespace {
 
 struct MapCallbackData {
@@ -217,4 +219,6 @@ uint64_t WebGPUQueryPool::get_mean_shader_ns(
   return n == 0 ? 0 : sum / n;
 }
 
+#endif // WGPU_BACKEND_ENABLE_PROFILING
+
 } // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/runtime/WebGPUQueryPool.h b/backends/webgpu/runtime/WebGPUQueryPool.h
index ca0dd67e0a4..9e5d6cb788c 100644
--- a/backends/webgpu/runtime/WebGPUQueryPool.h
+++ b/backends/webgpu/runtime/WebGPUQueryPool.h
@@ -17,6 +17,8 @@
 
 namespace executorch::backends::webgpu {
 
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
+
 // Per-dispatch GPU timing; mirrors Vulkan QueryPool ShaderDuration.
 struct ShaderDuration {
   uint32_t idx = 0;
@@ -81,4 +83,6 @@ class WebGPUQueryPool {
   std::vector<ShaderDuration> durations_;
 };
 
+#endif // WGPU_BACKEND_ENABLE_PROFILING
+
 } // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
index 1a7224b80f6..e62d6f2b53c 100644
--- a/backends/webgpu/test/test_webgpu_native.cpp
+++ b/backends/webgpu/test/test_webgpu_native.cpp
@@ -133,6 +133,7 @@ static bool test_chained_add(const std::string& model_path) {
   return true;
 }
 
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
 // Capacity-overrun must throw; runs without a device or TimestampQuery.
 static bool test_query_pool_overrun_throws() {
   printf("\n--- Test: WebGPUQueryPool capacity-overrun guard ---\n");
@@ -255,6 +256,7 @@ static bool test_query_pool_roundtrip(const WebGPUContext& ctx) {
   printf("PASS: WebGPUQueryPool roundtrip -- non-zero GPU kernel duration\n");
   return true;
 }
+#endif // WGPU_BACKEND_ENABLE_PROFILING
 
 int main(int argc, char** argv) {
   std::string model_path = "webgpu_add_test.pte";
@@ -281,8 +283,11 @@ int main(int argc, char** argv) {
   set_default_webgpu_context(&ctx);
   printf("WebGPU device acquired (native)\n");
 
-  bool ok = test_query_pool_overrun_throws();
+  bool ok = true;
+#ifdef WGPU_BACKEND_ENABLE_PROFILING
+  ok = test_query_pool_overrun_throws() && ok;
   ok = test_query_pool_roundtrip(ctx) && ok;
+#endif // WGPU_BACKEND_ENABLE_PROFILING
   ok = test_single_add(model_path) && ok;
 
   if (!chained_model_path.empty()) {

From cea23057dd36a2c154c6b4e1af4820d6b14bedb5 Mon Sep 17 00:00:00 2001
From: Julian Ng-Thow-Hing <juliannth@meta.com>
Date: Fri, 12 Jun 2026 14:15:12 -0700
Subject: [PATCH 3/3] Update

[ghstack-poisoned]
---
 backends/webgpu/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt
index da00858ab92..1fc0860fc4b 100644
--- a/backends/webgpu/CMakeLists.txt
+++ b/backends/webgpu/CMakeLists.txt
@@ -83,7 +83,9 @@ option(EXECUTORCH_BUILD_WEBGPU_PROFILING
        "Enable WebGPU GPU timestamp-query profiling" OFF
 )
 if(EXECUTORCH_BUILD_WEBGPU_PROFILING)
-  target_compile_definitions(webgpu_backend PRIVATE WGPU_BACKEND_ENABLE_PROFILING)
+  target_compile_definitions(
+    webgpu_backend PRIVATE WGPU_BACKEND_ENABLE_PROFILING
+  )
 endif()
 
 # Link with --whole-archive for static registration of backend + ops