From 772fde70e73a152bcac4c0de1ab60dceaa4a0bc0 Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Wed, 10 Jun 2026 13:19:20 -0700 Subject: [PATCH 1/3] Update [ghstack-poisoned] --- backends/webgpu/CMakeLists.txt | 1 + backends/webgpu/runtime/WebGPUDevice.cpp | 11 + backends/webgpu/runtime/WebGPUDevice.h | 8 + backends/webgpu/runtime/WebGPUGraph.cpp | 56 ++++- backends/webgpu/runtime/WebGPUGraph.h | 1 + backends/webgpu/runtime/WebGPUQueryPool.cpp | 220 ++++++++++++++++++++ backends/webgpu/runtime/WebGPUQueryPool.h | 84 ++++++++ 7 files changed, 380 insertions(+), 1 deletion(-) create mode 100644 backends/webgpu/runtime/WebGPUQueryPool.cpp create mode 100644 backends/webgpu/runtime/WebGPUQueryPool.h diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 9b1476f2290..9847763dba5 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -30,6 +30,7 @@ set(WEBGPU_SRCS runtime/WebGPUGraph.cpp runtime/WebGPUDelegateHeader.cpp runtime/WebGPUDevice.cpp + runtime/WebGPUQueryPool.cpp runtime/ops/OperatorRegistry.cpp runtime/ops/add/BinaryOp.cpp runtime/ops/rms_norm/RmsNorm.cpp diff --git a/backends/webgpu/runtime/WebGPUDevice.cpp b/backends/webgpu/runtime/WebGPUDevice.cpp index 041cbe5a703..6672459320d 100644 --- a/backends/webgpu/runtime/WebGPUDevice.cpp +++ b/backends/webgpu/runtime/WebGPUDevice.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace executorch { namespace backends { @@ -137,6 +138,16 @@ WebGPUContext create_webgpu_context() { WGPUStatus_Success) { device_desc.requiredLimits = &supported_limits; } + + // Bench: enable TimestampQuery if available; fail-open (skip timing if not). + std::vector required_features; + if (wgpuAdapterHasFeature(ctx.adapter, WGPUFeatureName_TimestampQuery)) { + required_features.push_back(WGPUFeatureName_TimestampQuery); + device_desc.requiredFeatureCount = required_features.size(); + device_desc.requiredFeatures = required_features.data(); + ctx.timestamp_supported = true; + } + device_desc.uncapturedErrorCallbackInfo.callback = on_device_error; WGPUWaitStatus device_wait = webgpu_wait( diff --git a/backends/webgpu/runtime/WebGPUDevice.h b/backends/webgpu/runtime/WebGPUDevice.h index 78afd96316a..90100fa831a 100644 --- a/backends/webgpu/runtime/WebGPUDevice.h +++ b/backends/webgpu/runtime/WebGPUDevice.h @@ -10,6 +10,10 @@ #include +#include + +#include + namespace executorch { namespace backends { namespace webgpu { @@ -19,6 +23,10 @@ struct WebGPUContext { WGPUAdapter adapter = nullptr; WGPUDevice device = nullptr; WGPUQueue queue = nullptr; + // True if the device was created with the TimestampQuery feature (bench). + bool timestamp_supported = false; + // Bench-only: timestamp-query pool, lazily created in execute() (env-gated). + std::unique_ptr querypool; }; WebGPUContext create_webgpu_context(); diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index b3ae5511d13..410df4d89dc 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -496,18 +497,48 @@ void WebGPUGraph::copy_inputs( } } +namespace { +// Bench gate: WEBGPU_TIMESTAMP_QUERY enables per-pass GPU timestamp queries. +bool should_timestamp_query() { + static const bool enabled = std::getenv("WEBGPU_TIMESTAMP_QUERY") != nullptr; + return enabled; +} +} // namespace + void WebGPUGraph::execute() { const size_t n = dispatches_.size(); const size_t chunk = execute_config_.chunk_size; if (chunk == 0 || n <= chunk) { + // Bench: timestamp-query pool, null unless env-gated + feature present. + WebGPUQueryPool* qp = nullptr; + if (should_timestamp_query() && n > 0) { + if (auto* ctx = get_default_webgpu_context()) { + if (ctx->timestamp_supported) { + if (!ctx->querypool || ctx->querypool->capacity() < n) { + ctx->querypool = std::make_unique(); + ctx->querypool->initialize(device_, static_cast(n)); + } + qp = ctx->querypool.get(); + qp->reset(static_cast(n)); + } + } + } + WGPUCommandEncoderDescriptor enc_desc = {}; WGPUCommandEncoder encoder = wgpuDeviceCreateCommandEncoder(device_, &enc_desc); // One pass per dispatch: enforces storage RAW ordering across deps. - for (const auto& dispatch : dispatches_) { + for (size_t i = 0; i < n; i++) { + const auto& dispatch = dispatches_[i]; + // tw must outlive BeginComputePass (the descriptor points at it). + WGPUPassTimestampWrites tw = {}; WGPUComputePassDescriptor pass_desc = {}; + if (qp) { + tw = qp->writes_for(static_cast(i)); + pass_desc.timestampWrites = &tw; + } WGPUComputePassEncoder pass = wgpuCommandEncoderBeginComputePass(encoder, &pass_desc); wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline); @@ -517,6 +548,13 @@ void WebGPUGraph::execute() { pass, dispatch.workgroup_count_x, 1, 1); wgpuComputePassEncoderEnd(pass); wgpuComputePassEncoderRelease(pass); + if (qp) { + qp->record( + static_cast(i), + dispatch.kernel_name, + {dispatch.workgroup_count_x, 1, 1}, + {1, 1, 1}); + } } for (const auto& copy : output_copies_) { @@ -524,15 +562,31 @@ void WebGPUGraph::execute() { encoder, copy.src_buffer, 0, copy.staging_buffer, 0, copy.nbytes); } + if (qp) { + qp->resolve(encoder); + } + WGPUCommandBufferDescriptor cmd_desc = {}; WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc); wgpuQueueSubmit(queue_, 1, &cmd); wgpuCommandBufferRelease(cmd); wgpuCommandEncoderRelease(encoder); + + if (qp) { + qp->extract_results(instance_); + qp->print_results(); + } return; } + // GPU timestamp queries assume one submit; chunked execute is multi-submit. + if (should_timestamp_query()) { + throw std::runtime_error( + "WebGPU: WEBGPU_TIMESTAMP_QUERY is incompatible with chunked execute " + "(multi-submit); disable chunking to use GPU timestamp queries"); + } + const size_t first_chunk = execute_config_.initial_chunk_size > 0 ? execute_config_.initial_chunk_size : chunk; diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h index 9f656ce4d14..92aa14d59b6 100644 --- a/backends/webgpu/runtime/WebGPUGraph.h +++ b/backends/webgpu/runtime/WebGPUGraph.h @@ -31,6 +31,7 @@ struct WebGPUDispatch { WGPUComputePipeline pipeline = nullptr; WGPUBindGroup bind_group = nullptr; uint32_t workgroup_count_x = 1; + std::string kernel_name; // bench label }; struct OutputCopy { diff --git a/backends/webgpu/runtime/WebGPUQueryPool.cpp b/backends/webgpu/runtime/WebGPUQueryPool.cpp new file mode 100644 index 00000000000..0e5c583337c --- /dev/null +++ b/backends/webgpu/runtime/WebGPUQueryPool.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include +#include +#include +#include + +namespace executorch::backends::webgpu { + +namespace { + +struct MapCallbackData { + WGPUMapAsyncStatus status = WGPUMapAsyncStatus_Error; +}; + +void map_callback( + WGPUMapAsyncStatus status, + WGPUStringView /*message*/, + void* userdata1, + void* /*userdata2*/) { + auto* data = static_cast(userdata1); + data->status = status; +} + +constexpr uint64_t kTimestampBytes = sizeof(uint64_t); + +} // namespace + +WebGPUQueryPool::~WebGPUQueryPool() { + if (readback_buf_) { + wgpuBufferRelease(readback_buf_); + } + if (resolve_buf_) { + wgpuBufferRelease(resolve_buf_); + } + if (qset_) { + wgpuQuerySetRelease(qset_); + } +} + +void WebGPUQueryPool::initialize(WGPUDevice device, uint32_t max_pairs) { + if (max_pairs == 0) { + return; + } + // Re-init guard; mirrors Vulkan QueryPool (avoids leaking a prior QuerySet). + if (qset_ != nullptr) { + return; + } + capacity_pairs_ = max_pairs; + const uint32_t count = 2 * max_pairs; + const uint64_t bytes = static_cast(count) * kTimestampBytes; + + WGPUQuerySetDescriptor qsd = {}; + qsd.type = WGPUQueryType_Timestamp; + qsd.count = count; + qset_ = wgpuDeviceCreateQuerySet(device, &qsd); + + WGPUBufferDescriptor rbd = {}; + rbd.size = bytes; + rbd.usage = WGPUBufferUsage_QueryResolve | WGPUBufferUsage_CopySrc; + resolve_buf_ = wgpuDeviceCreateBuffer(device, &rbd); + + WGPUBufferDescriptor mbd = {}; + mbd.size = bytes; + mbd.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst; + readback_buf_ = wgpuDeviceCreateBuffer(device, &mbd); + // WebGPU timestamps are already nanoseconds, so ns_per_tick_ stays 1.0. +} + +void WebGPUQueryPool::reset(uint32_t num_dispatches) { + // Fail loud on overrun; mirrors Vulkan QueryPool VK_CHECK_COND guard. + if (num_dispatches > capacity_pairs_) { + throw std::runtime_error( + "WebGPUQueryPool: num_dispatches " + std::to_string(num_dispatches) + + " exceeds capacity " + std::to_string(capacity_pairs_)); + } + num_pairs_ = num_dispatches; + durations_.clear(); +} + +WGPUPassTimestampWrites WebGPUQueryPool::writes_for(uint32_t i) { + WGPUPassTimestampWrites tw = {}; + tw.querySet = qset_; + tw.beginningOfPassWriteIndex = 2 * i; + tw.endOfPassWriteIndex = 2 * i + 1; + return tw; +} + +void WebGPUQueryPool::record( + uint32_t i, + const std::string& name, + std::array gwg, + std::array lwg) { + ShaderDuration d; + d.idx = i; + d.kernel_name = name; + d.global_wg = gwg; + d.local_wg = lwg; + durations_.push_back(d); +} + +void WebGPUQueryPool::resolve(WGPUCommandEncoder encoder) { + if (num_pairs_ == 0) { + return; + } + const uint32_t count = 2 * num_pairs_; + wgpuCommandEncoderResolveQuerySet(encoder, qset_, 0, count, resolve_buf_, 0); + wgpuCommandEncoderCopyBufferToBuffer( + encoder, + resolve_buf_, + 0, + readback_buf_, + 0, + static_cast(count) * kTimestampBytes); +} + +void WebGPUQueryPool::extract_results(WGPUInstance instance) { + if (num_pairs_ == 0) { + return; + } + const uint32_t count = 2 * num_pairs_; + const uint64_t bytes = static_cast(count) * kTimestampBytes; + + MapCallbackData cb; + WGPUBufferMapCallbackInfo cb_info = {}; + cb_info.mode = WGPUCallbackMode_WaitAnyOnly; + cb_info.callback = map_callback; + cb_info.userdata1 = &cb; + webgpu_wait( + instance, + wgpuBufferMapAsync(readback_buf_, WGPUMapMode_Read, 0, bytes, cb_info)); + + if (cb.status != WGPUMapAsyncStatus_Success) { + printf( + "WebGPUQueryPool: readback map failed (status %d)\n", (int)cb.status); + return; + } + const uint64_t* ticks = static_cast( + wgpuBufferGetConstMappedRange(readback_buf_, 0, bytes)); + if (ticks != nullptr) { + for (auto& d : durations_) { + const uint64_t t0 = ticks[2 * d.idx]; + const uint64_t t1 = ticks[2 * d.idx + 1]; + d.start_time_ns = static_cast(t0 * ns_per_tick_); + d.end_time_ns = static_cast(t1 * ns_per_tick_); + d.execution_duration_ns = + (t1 >= t0) ? static_cast((t1 - t0) * ns_per_tick_) : 0; + } + } + wgpuBufferUnmap(readback_buf_); +} + +void WebGPUQueryPool::print_results(bool tsv) const { + const char* sep = tsv ? "\t" : " "; + if (tsv) { + printf("idx%skernel%sgwg%sduration_us\n", sep, sep, sep); + } else { + printf("=== WebGPUQueryPool: per-dispatch GPU time ===\n"); + } + for (const auto& d : durations_) { + const double us = d.execution_duration_ns / 1000.0; + printf( + "%u%s%s%s(%u,%u,%u)%s%.3f\n", + d.idx, + sep, + d.kernel_name.empty() ? "dispatch" : d.kernel_name.c_str(), + sep, + d.global_wg[0], + d.global_wg[1], + d.global_wg[2], + sep, + us); + } + if (tsv) { + return; + } + std::map> totals; + for (const auto& d : durations_) { + auto& t = totals[d.kernel_name.empty() ? "dispatch" : d.kernel_name]; + t.first += d.execution_duration_ns; + t.second += 1; + } + printf("--- per-kernel mean / total (us) ---\n"); + for (const auto& kv : totals) { + const double mean_us = kv.second.first / kv.second.second / 1000.0; + const double total_us = kv.second.first / 1000.0; + printf( + "%s%smean %.3f%stotal %.3f (n=%u)\n", + kv.first.c_str(), + sep, + mean_us, + sep, + total_us, + kv.second.second); + } +} + +uint64_t WebGPUQueryPool::get_mean_shader_ns( + const std::string& kernel_name) const { + uint64_t sum = 0; + uint32_t n = 0; + for (const auto& d : durations_) { + if (d.kernel_name == kernel_name) { + sum += d.execution_duration_ns; + n += 1; + } + } + return n == 0 ? 0 : sum / n; +} + +} // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/WebGPUQueryPool.h b/backends/webgpu/runtime/WebGPUQueryPool.h new file mode 100644 index 00000000000..ca0dd67e0a4 --- /dev/null +++ b/backends/webgpu/runtime/WebGPUQueryPool.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include +#include +#include +#include + +namespace executorch::backends::webgpu { + +// Per-dispatch GPU timing; mirrors Vulkan QueryPool ShaderDuration. +struct ShaderDuration { + uint32_t idx = 0; + std::string kernel_name; + std::array global_wg{}; + std::array local_wg{}; + uint64_t start_time_ns = 0; + uint64_t end_time_ns = 0; + uint64_t execution_duration_ns = 0; +}; + +// GPU timestamp-query pool; re-port of Vulkan vk_api/QueryPool. +class WebGPUQueryPool { + public: + WebGPUQueryPool() = default; + ~WebGPUQueryPool(); + + WebGPUQueryPool(const WebGPUQueryPool&) = delete; + WebGPUQueryPool& operator=(const WebGPUQueryPool&) = delete; + + // Create the QuerySet + readback buffers; query the ns-per-tick period. + void initialize(WGPUDevice device, uint32_t max_pairs); + bool is_initialized() const { + return qset_ != nullptr; + } + uint32_t capacity() const { + return capacity_pairs_; + } + + // Clear durations and set the dispatch count for this run. + void reset(uint32_t num_dispatches); + + // timestampWrites for pass i: begin=2i, end=2i+1. + WGPUPassTimestampWrites writes_for(uint32_t i); + + // Record pass i's label + workgroup sizes (start/end filled by extract). + void record( + uint32_t i, + const std::string& name, + std::array gwg, + std::array lwg); + + // Resolve the QuerySet into the readback buffer; call before submit. + void resolve(WGPUCommandEncoder encoder); + + // Map the readback, convert ticks->ns, fill durations; call after submit. + void extract_results(WGPUInstance instance); + + const std::vector& results() const { + return durations_; + } + void print_results(bool tsv = false) const; + uint64_t get_mean_shader_ns(const std::string& kernel_name) const; + + private: + WGPUQuerySet qset_ = nullptr; + WGPUBuffer resolve_buf_ = nullptr; // QueryResolve | CopySrc + WGPUBuffer readback_buf_ = nullptr; // MapRead | CopyDst + uint32_t capacity_pairs_ = 0; + uint32_t num_pairs_ = 0; + double ns_per_tick_ = 1.0; // WebGPU timestamps are already nanoseconds + std::vector durations_; +}; + +} // namespace executorch::backends::webgpu From 66e8fa48ead991d51822c7e5c59576c79f262e22 Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Fri, 12 Jun 2026 13:31:34 -0700 Subject: [PATCH 2/3] Update [ghstack-poisoned] --- backends/webgpu/CMakeLists.txt | 14 ++++++++++++++ backends/webgpu/runtime/WebGPUDevice.cpp | 8 ++++++++ backends/webgpu/runtime/WebGPUDevice.h | 4 ++++ backends/webgpu/runtime/WebGPUGraph.cpp | 19 +++++++++++++++++-- backends/webgpu/runtime/WebGPUQueryPool.cpp | 4 ++++ backends/webgpu/runtime/WebGPUQueryPool.h | 4 ++++ backends/webgpu/test/test_webgpu_native.cpp | 7 ++++++- 7 files changed, 57 insertions(+), 3 deletions(-) diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index 9847763dba5..da00858ab92 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -77,6 +77,15 @@ endif() target_compile_options(webgpu_backend PRIVATE -fexceptions) +# Opt-in GPU timestamp profiling (WebGPUQueryPool); OFF so production builds +# request no TimestampQuery device feature. Mirrors Vulkan's compile-flag gate. +option(EXECUTORCH_BUILD_WEBGPU_PROFILING + "Enable WebGPU GPU timestamp-query profiling" OFF +) +if(EXECUTORCH_BUILD_WEBGPU_PROFILING) + target_compile_definitions(webgpu_backend PRIVATE WGPU_BACKEND_ENABLE_PROFILING) +endif() + # Link with --whole-archive for static registration of backend + ops executorch_target_link_options_shared_lib(webgpu_backend) @@ -115,6 +124,11 @@ function(add_webgpu_native_test test_name test_src) target_link_libraries(${test_name} PRIVATE dl m pthread) endif() target_compile_options(${test_name} PRIVATE -fexceptions) + if(EXECUTORCH_BUILD_WEBGPU_PROFILING) + target_compile_definitions( + ${test_name} PRIVATE WGPU_BACKEND_ENABLE_PROFILING + ) + endif() set_property(TARGET ${test_name} PROPERTY CXX_STANDARD 17) endfunction() diff --git a/backends/webgpu/runtime/WebGPUDevice.cpp b/backends/webgpu/runtime/WebGPUDevice.cpp index 6672459320d..e69101851a2 100644 --- a/backends/webgpu/runtime/WebGPUDevice.cpp +++ b/backends/webgpu/runtime/WebGPUDevice.cpp @@ -13,7 +13,9 @@ #include #include #include +#ifdef WGPU_BACKEND_ENABLE_PROFILING #include +#endif // WGPU_BACKEND_ENABLE_PROFILING namespace executorch { namespace backends { @@ -139,6 +141,7 @@ WebGPUContext create_webgpu_context() { device_desc.requiredLimits = &supported_limits; } +#ifdef WGPU_BACKEND_ENABLE_PROFILING // Bench: enable TimestampQuery if available; fail-open (skip timing if not). std::vector required_features; if (wgpuAdapterHasFeature(ctx.adapter, WGPUFeatureName_TimestampQuery)) { @@ -147,6 +150,7 @@ WebGPUContext create_webgpu_context() { device_desc.requiredFeatures = required_features.data(); ctx.timestamp_supported = true; } +#endif // WGPU_BACKEND_ENABLE_PROFILING device_desc.uncapturedErrorCallbackInfo.callback = on_device_error; @@ -203,6 +207,10 @@ WebGPUContext* get_default_webgpu_context() { } void destroy_webgpu_context(WebGPUContext& ctx) { +#ifdef WGPU_BACKEND_ENABLE_PROFILING + // Release device-child GPU resources before the device handle. + ctx.querypool.reset(); +#endif // WGPU_BACKEND_ENABLE_PROFILING if (ctx.queue) { wgpuQueueRelease(ctx.queue); ctx.queue = nullptr; diff --git a/backends/webgpu/runtime/WebGPUDevice.h b/backends/webgpu/runtime/WebGPUDevice.h index 90100fa831a..a332edef443 100644 --- a/backends/webgpu/runtime/WebGPUDevice.h +++ b/backends/webgpu/runtime/WebGPUDevice.h @@ -10,9 +10,11 @@ #include +#ifdef WGPU_BACKEND_ENABLE_PROFILING #include #include +#endif // WGPU_BACKEND_ENABLE_PROFILING namespace executorch { namespace backends { @@ -23,10 +25,12 @@ struct WebGPUContext { WGPUAdapter adapter = nullptr; WGPUDevice device = nullptr; WGPUQueue queue = nullptr; +#ifdef WGPU_BACKEND_ENABLE_PROFILING // True if the device was created with the TimestampQuery feature (bench). bool timestamp_supported = false; // Bench-only: timestamp-query pool, lazily created in execute() (env-gated). std::unique_ptr querypool; +#endif // WGPU_BACKEND_ENABLE_PROFILING }; WebGPUContext create_webgpu_context(); diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index 410df4d89dc..1c977d130dd 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -498,10 +498,15 @@ void WebGPUGraph::copy_inputs( } namespace { -// Bench gate: WEBGPU_TIMESTAMP_QUERY enables per-pass GPU timestamp queries. +// Bench gate: compiled out unless WGPU_BACKEND_ENABLE_PROFILING; then the +// WEBGPU_TIMESTAMP_QUERY env var enables per-pass GPU timestamp queries. bool should_timestamp_query() { +#ifdef WGPU_BACKEND_ENABLE_PROFILING static const bool enabled = std::getenv("WEBGPU_TIMESTAMP_QUERY") != nullptr; return enabled; +#else + return false; +#endif } } // namespace @@ -510,6 +515,7 @@ void WebGPUGraph::execute() { const size_t chunk = execute_config_.chunk_size; if (chunk == 0 || n <= chunk) { +#ifdef WGPU_BACKEND_ENABLE_PROFILING // Bench: timestamp-query pool, null unless env-gated + feature present. WebGPUQueryPool* qp = nullptr; if (should_timestamp_query() && n > 0) { @@ -524,6 +530,7 @@ void WebGPUGraph::execute() { } } } +#endif // WGPU_BACKEND_ENABLE_PROFILING WGPUCommandEncoderDescriptor enc_desc = {}; WGPUCommandEncoder encoder = @@ -532,13 +539,15 @@ void WebGPUGraph::execute() { // One pass per dispatch: enforces storage RAW ordering across deps. for (size_t i = 0; i < n; i++) { const auto& dispatch = dispatches_[i]; + WGPUComputePassDescriptor pass_desc = {}; +#ifdef WGPU_BACKEND_ENABLE_PROFILING // tw must outlive BeginComputePass (the descriptor points at it). WGPUPassTimestampWrites tw = {}; - WGPUComputePassDescriptor pass_desc = {}; if (qp) { tw = qp->writes_for(static_cast(i)); pass_desc.timestampWrites = &tw; } +#endif // WGPU_BACKEND_ENABLE_PROFILING WGPUComputePassEncoder pass = wgpuCommandEncoderBeginComputePass(encoder, &pass_desc); wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline); @@ -548,6 +557,7 @@ void WebGPUGraph::execute() { pass, dispatch.workgroup_count_x, 1, 1); wgpuComputePassEncoderEnd(pass); wgpuComputePassEncoderRelease(pass); +#ifdef WGPU_BACKEND_ENABLE_PROFILING if (qp) { qp->record( static_cast(i), @@ -555,6 +565,7 @@ void WebGPUGraph::execute() { {dispatch.workgroup_count_x, 1, 1}, {1, 1, 1}); } +#endif // WGPU_BACKEND_ENABLE_PROFILING } for (const auto& copy : output_copies_) { @@ -562,9 +573,11 @@ void WebGPUGraph::execute() { encoder, copy.src_buffer, 0, copy.staging_buffer, 0, copy.nbytes); } +#ifdef WGPU_BACKEND_ENABLE_PROFILING if (qp) { qp->resolve(encoder); } +#endif // WGPU_BACKEND_ENABLE_PROFILING WGPUCommandBufferDescriptor cmd_desc = {}; WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc); @@ -573,10 +586,12 @@ void WebGPUGraph::execute() { wgpuCommandBufferRelease(cmd); wgpuCommandEncoderRelease(encoder); +#ifdef WGPU_BACKEND_ENABLE_PROFILING if (qp) { qp->extract_results(instance_); qp->print_results(); } +#endif // WGPU_BACKEND_ENABLE_PROFILING return; } diff --git a/backends/webgpu/runtime/WebGPUQueryPool.cpp b/backends/webgpu/runtime/WebGPUQueryPool.cpp index 0e5c583337c..89e08a2afce 100644 --- a/backends/webgpu/runtime/WebGPUQueryPool.cpp +++ b/backends/webgpu/runtime/WebGPUQueryPool.cpp @@ -16,6 +16,8 @@ namespace executorch::backends::webgpu { +#ifdef WGPU_BACKEND_ENABLE_PROFILING + namespace { struct MapCallbackData { @@ -217,4 +219,6 @@ uint64_t WebGPUQueryPool::get_mean_shader_ns( return n == 0 ? 0 : sum / n; } +#endif // WGPU_BACKEND_ENABLE_PROFILING + } // namespace executorch::backends::webgpu diff --git a/backends/webgpu/runtime/WebGPUQueryPool.h b/backends/webgpu/runtime/WebGPUQueryPool.h index ca0dd67e0a4..9e5d6cb788c 100644 --- a/backends/webgpu/runtime/WebGPUQueryPool.h +++ b/backends/webgpu/runtime/WebGPUQueryPool.h @@ -17,6 +17,8 @@ namespace executorch::backends::webgpu { +#ifdef WGPU_BACKEND_ENABLE_PROFILING + // Per-dispatch GPU timing; mirrors Vulkan QueryPool ShaderDuration. struct ShaderDuration { uint32_t idx = 0; @@ -81,4 +83,6 @@ class WebGPUQueryPool { std::vector durations_; }; +#endif // WGPU_BACKEND_ENABLE_PROFILING + } // namespace executorch::backends::webgpu diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index 1a7224b80f6..e62d6f2b53c 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -133,6 +133,7 @@ static bool test_chained_add(const std::string& model_path) { return true; } +#ifdef WGPU_BACKEND_ENABLE_PROFILING // Capacity-overrun must throw; runs without a device or TimestampQuery. static bool test_query_pool_overrun_throws() { printf("\n--- Test: WebGPUQueryPool capacity-overrun guard ---\n"); @@ -255,6 +256,7 @@ static bool test_query_pool_roundtrip(const WebGPUContext& ctx) { printf("PASS: WebGPUQueryPool roundtrip -- non-zero GPU kernel duration\n"); return true; } +#endif // WGPU_BACKEND_ENABLE_PROFILING int main(int argc, char** argv) { std::string model_path = "webgpu_add_test.pte"; @@ -281,8 +283,11 @@ int main(int argc, char** argv) { set_default_webgpu_context(&ctx); printf("WebGPU device acquired (native)\n"); - bool ok = test_query_pool_overrun_throws(); + bool ok = true; +#ifdef WGPU_BACKEND_ENABLE_PROFILING + ok = test_query_pool_overrun_throws() && ok; ok = test_query_pool_roundtrip(ctx) && ok; +#endif // WGPU_BACKEND_ENABLE_PROFILING ok = test_single_add(model_path) && ok; if (!chained_model_path.empty()) { From cea23057dd36a2c154c6b4e1af4820d6b14bedb5 Mon Sep 17 00:00:00 2001 From: Julian Ng-Thow-Hing Date: Fri, 12 Jun 2026 14:15:12 -0700 Subject: [PATCH 3/3] Update [ghstack-poisoned] --- backends/webgpu/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/webgpu/CMakeLists.txt b/backends/webgpu/CMakeLists.txt index da00858ab92..1fc0860fc4b 100644 --- a/backends/webgpu/CMakeLists.txt +++ b/backends/webgpu/CMakeLists.txt @@ -83,7 +83,9 @@ option(EXECUTORCH_BUILD_WEBGPU_PROFILING "Enable WebGPU GPU timestamp-query profiling" OFF ) if(EXECUTORCH_BUILD_WEBGPU_PROFILING) - target_compile_definitions(webgpu_backend PRIVATE WGPU_BACKEND_ENABLE_PROFILING) + target_compile_definitions( + webgpu_backend PRIVATE WGPU_BACKEND_ENABLE_PROFILING + ) endif() # Link with --whole-archive for static registration of backend + ops