1515#include < executorch/backends/webgpu/runtime/WebGPUCompat.h>
1616#include < executorch/backends/webgpu/runtime/WebGPUDevice.h>
1717
18+ #include < cstdlib>
1819#include < cstring>
1920#include < stdexcept>
2021
@@ -496,18 +497,48 @@ void WebGPUGraph::copy_inputs(
496497 }
497498}
498499
namespace {

// Bench gate: reports whether per-pass GPU timestamp queries were requested.
//
// The feature is opted into through the WEBGPU_TIMESTAMP_QUERY environment
// variable. Only the variable's presence is tested, so any value (even an
// empty one) switches the gate on. The environment is consulted on the first
// call only; the answer is cached in a function-local static, so changing the
// variable afterwards has no effect.
//
// Returns true if WEBGPU_TIMESTAMP_QUERY was defined when first queried.
bool should_timestamp_query() {
  // getenv matches the name exactly, so the literal must carry no padding.
  static const bool enabled =
      std::getenv("WEBGPU_TIMESTAMP_QUERY") != nullptr;
  return enabled;
}

} // namespace
507+
499508void WebGPUGraph::execute () {
500509 const size_t n = dispatches_.size ();
501510 const size_t chunk = execute_config_.chunk_size ;
502511
503512 if (chunk == 0 || n <= chunk) {
513+ // Bench: timestamp-query pool, null unless env-gated + feature present.
514+ WebGPUQueryPool* qp = nullptr ;
515+ if (should_timestamp_query () && n > 0 ) {
516+ if (auto * ctx = get_default_webgpu_context ()) {
517+ if (ctx->timestamp_supported ) {
518+ if (!ctx->querypool || ctx->querypool ->capacity () < n) {
519+ ctx->querypool = std::make_unique<WebGPUQueryPool>();
520+ ctx->querypool ->initialize (device_, static_cast <uint32_t >(n));
521+ }
522+ qp = ctx->querypool .get ();
523+ qp->reset (static_cast <uint32_t >(n));
524+ }
525+ }
526+ }
527+
504528 WGPUCommandEncoderDescriptor enc_desc = {};
505529 WGPUCommandEncoder encoder =
506530 wgpuDeviceCreateCommandEncoder (device_, &enc_desc);
507531
508532 // One pass per dispatch: enforces storage RAW ordering across deps.
509- for (const auto & dispatch : dispatches_) {
533+ for (size_t i = 0 ; i < n; i++) {
534+ const auto & dispatch = dispatches_[i];
535+ // tw must outlive BeginComputePass (the descriptor points at it).
536+ WGPUPassTimestampWrites tw = {};
510537 WGPUComputePassDescriptor pass_desc = {};
538+ if (qp) {
539+ tw = qp->writes_for (static_cast <uint32_t >(i));
540+ pass_desc.timestampWrites = &tw;
541+ }
511542 WGPUComputePassEncoder pass =
512543 wgpuCommandEncoderBeginComputePass (encoder, &pass_desc);
513544 wgpuComputePassEncoderSetPipeline (pass, dispatch.pipeline );
@@ -517,22 +548,45 @@ void WebGPUGraph::execute() {
517548 pass, dispatch.workgroup_count_x , 1 , 1 );
518549 wgpuComputePassEncoderEnd (pass);
519550 wgpuComputePassEncoderRelease (pass);
551+ if (qp) {
552+ qp->record (
553+ static_cast <uint32_t >(i),
554+ dispatch.kernel_name ,
555+ {dispatch.workgroup_count_x , 1 , 1 },
556+ {1 , 1 , 1 });
557+ }
520558 }
521559
522560 for (const auto & copy : output_copies_) {
523561 wgpuCommandEncoderCopyBufferToBuffer (
524562 encoder, copy.src_buffer , 0 , copy.staging_buffer , 0 , copy.nbytes );
525563 }
526564
565+ if (qp) {
566+ qp->resolve (encoder);
567+ }
568+
527569 WGPUCommandBufferDescriptor cmd_desc = {};
528570 WGPUCommandBuffer cmd = wgpuCommandEncoderFinish (encoder, &cmd_desc);
529571 wgpuQueueSubmit (queue_, 1 , &cmd);
530572
531573 wgpuCommandBufferRelease (cmd);
532574 wgpuCommandEncoderRelease (encoder);
575+
576+ if (qp) {
577+ qp->extract_results (instance_);
578+ qp->print_results ();
579+ }
533580 return ;
534581 }
535582
583+ // GPU timestamp queries assume one submit; chunked execute is multi-submit.
584+ if (should_timestamp_query ()) {
585+ throw std::runtime_error (
586+ " WebGPU: WEBGPU_TIMESTAMP_QUERY is incompatible with chunked execute "
587+ " (multi-submit); disable chunking to use GPU timestamp queries" );
588+ }
589+
536590 const size_t first_chunk = execute_config_.initial_chunk_size > 0
537591 ? execute_config_.initial_chunk_size
538592 : chunk;
0 commit comments