Skip to content

Commit f0dff03

Browse files
[ExecuTorch][WebGPU] Add per-pass dispatch ordering + scratch buffer tests
Pull Request resolved: #20080 Native unit tests for two runtime enablers: per-pass compute-dispatch ordering (D107543258) and graph-owned scratch buffers (D107543259). `test/native/test_dispatch_order.cpp` exercises multi-dispatch read-after-write ordering through a single `execute()` using dependency chains -- a single-input `add` self-chain and a heterogeneous `rms_norm` -> `add` chain, both lowered via `VulkanPartitioner` -- comparing GPU output to a torch-computed golden per element. `test/native/test_scratch_buffer.cpp` is a white-box test of `WebGPUGraph::create_scratch_buffer` (no black-box consumer exists below the SDPA op): allocation + zero-size guard, copy round-trip, a compute Storage round-trip (its actual use), and a create/destroy lifecycle stress. Authored with assistance from Claude. ghstack-source-id: 391979580 @exported-using-ghexport Differential Revision: [D107576199](https://our.internmc.facebook.com/intern/diff/D107576199/)
1 parent f8bf776 commit f0dff03

8 files changed

Lines changed: 576 additions & 3 deletions

File tree

backends/webgpu/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,4 +118,10 @@ endfunction()
118118
if(EXECUTORCH_BUILD_WEBGPU_TEST)
119119
add_webgpu_native_test(webgpu_native_test test/test_webgpu_native.cpp)
120120
add_webgpu_native_test(webgpu_rms_norm_test test/native/test_rms_norm.cpp)
121+
add_webgpu_native_test(
122+
webgpu_dispatch_order_test test/native/test_dispatch_order.cpp
123+
)
124+
add_webgpu_native_test(
125+
webgpu_scratch_buffer_test test/native/test_scratch_buffer.cpp
126+
)
121127
endif()

backends/webgpu/scripts/test_webgpu_native_ci.sh

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,14 @@ fi
3737
cd "${EXECUTORCH_ROOT}"
3838

3939
# ── Exports for the model-driven executables (best-effort) ───────────────────
40-
# native_test + rms_norm read .pte/golden inputs via WEBGPU_TEST_* env and
41-
# self-skip if absent; dispatch_order + scratch are standalone (no exports).
40+
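# native_test + rms_norm + dispatch_order read .pte/golden inputs via env/dir.
41+
# native_test + rms_norm self-skip if absent; dispatch_order reports FAIL on
# missing inputs, so it is only run when its export succeeded; scratch is
# standalone (generates its own inputs).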
4242
PTE_MODEL="/tmp/webgpu_add_test.pte"
4343
PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte"
4444
RMS_NORM_DIR="/tmp/rmsn"
4545
RMS_NORM_OK=1
46+
DISPATCH_ORDER_DIR="/tmp/dispatch_order"
47+
DISPATCH_ORDER_OK=1
4648

4749
$PYTHON_EXECUTABLE -c "
4850
from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
@@ -55,6 +57,11 @@ from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rm
5557
export_rms_norm_cases('${RMS_NORM_DIR}')
5658
" || { echo "WARN: rms_norm export failed; skipping rms_norm native test"; RMS_NORM_OK=0; }
5759

60+
$PYTHON_EXECUTABLE -c "
61+
from executorch.backends.webgpu.test.ops.dispatch_order.test_dispatch_order import export_dispatch_order_cases
62+
export_dispatch_order_cases('${DISPATCH_ORDER_DIR}')
63+
" || { echo "WARN: dispatch_order export failed; skipping dispatch_order native test"; DISPATCH_ORDER_OK=0; }
64+
5865
# ── Configure (Dawn-only: no -DWEBGPU_IMPL; Dawn is the sole backend) ─────────
5966
echo "=== Configure WebGPU native tests on Dawn ==="
6067
rm -rf "${BUILD_DIR}"
@@ -115,7 +122,9 @@ fi
115122
if [[ "${RMS_NORM_OK}" == "1" && -x "${BIN_DIR}/webgpu_rms_norm_test" ]]; then
116123
"${BIN_DIR}/webgpu_rms_norm_test" "${RMS_NORM_DIR}"
117124
fi
118-
[[ -x "${BIN_DIR}/webgpu_dispatch_order_test" ]] && "${BIN_DIR}/webgpu_dispatch_order_test"
125+
if [[ "${DISPATCH_ORDER_OK}" == "1" && -x "${BIN_DIR}/webgpu_dispatch_order_test" ]]; then
126+
"${BIN_DIR}/webgpu_dispatch_order_test" "${DISPATCH_ORDER_DIR}"
127+
fi
119128
[[ -x "${BIN_DIR}/webgpu_scratch_buffer_test" ]] && "${BIN_DIR}/webgpu_scratch_buffer_test"
120129

121130
echo "=== WebGPU native tests on Dawn: all run targets passed ==="
Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/backends/webgpu/runtime/WebGPUDevice.h>
10+
#include <executorch/extension/module/module.h>
11+
#include <executorch/extension/tensor/tensor.h>
12+
13+
#include <algorithm>
14+
#include <cmath>
15+
#include <cstdio>
16+
#include <cstdlib>
17+
#include <fstream>
18+
#include <string>
19+
#include <vector>
20+
21+
using namespace executorch::backends::webgpu;
22+
using namespace executorch::extension;
23+
using namespace executorch::runtime;
24+
25+
namespace {
26+
27+
struct Case {
28+
const char* name;
29+
std::vector<int32_t> sizes;
30+
};
31+
32+
// Table of dispatch-order cases, run by main() in the order listed here.
// Mirrors _CASES in test_dispatch_order.py (add-chain or rms_norm+add chain).
// Each entry is {file stem, input tensor shape}; the per-case model, input and
// golden files are produced outside this file and looked up by stem.
33+
const std::vector<Case> kCases = {
34+
{"single", {16, 16}},
35+
{"chain3", {64, 64}},
36+
{"chain5_tiny", {1, 1}},
37+
{"chain5_wide", {7, 896}},
38+
{"chain8", {256, 256}},
39+
{"deep32", {128, 128}},
40+
{"large_chain", {1024, 1024}},
41+
// NOTE(review): the "het_" entries are presumably the heterogeneous
// rms_norm -> add chains (4-D shapes) -- confirm against the Python _CASES.
{"het_small", {1, 1, 7, 896}},
42+
{"het_deep", {1, 1, 5, 256}},
43+
};
44+
45+
std::vector<float> read_f32_bin(const std::string& path) {
46+
std::ifstream f(path, std::ios::binary | std::ios::ate);
47+
if (!f) {
48+
return {};
49+
}
50+
const auto file_size = static_cast<size_t>(f.tellg());
51+
if (file_size % sizeof(float) != 0) {
52+
return {}; // truncated/corrupt golden; caller treats empty as failure
53+
}
54+
f.seekg(0);
55+
std::vector<float> data(file_size / sizeof(float));
56+
f.read(
57+
reinterpret_cast<char*>(data.data()),
58+
static_cast<std::streamsize>(file_size));
59+
return data;
60+
}
61+
62+
bool run_case(const std::string& dir, const Case& tc) {
63+
printf("\n--- dispatch_order[%s] ---\n", tc.name);
64+
const std::string base = dir + "/" + tc.name;
65+
std::vector<float> input = read_f32_bin(base + ".input.bin");
66+
std::vector<float> golden = read_f32_bin(base + ".golden.bin");
67+
if (input.empty() || golden.empty()) {
68+
printf("FAIL: could not read input/golden for %s\n", tc.name);
69+
return false;
70+
}
71+
72+
Module module(base + ".pte");
73+
if (module.load_forward() != Error::Ok) {
74+
printf("FAIL: could not load %s.pte\n", tc.name);
75+
return false;
76+
}
77+
78+
size_t expected = 1;
79+
for (int32_t d : tc.sizes) {
80+
expected *= static_cast<size_t>(d);
81+
}
82+
if (input.size() != expected) {
83+
printf(
84+
"FAIL: input numel %zu != expected %zu for %s\n",
85+
input.size(),
86+
expected,
87+
tc.name);
88+
return false;
89+
}
90+
auto x = make_tensor_ptr(tc.sizes, std::vector<float>(input));
91+
auto result = module.forward({EValue(x)});
92+
if (!result.ok()) {
93+
printf("FAIL: forward failed (error %d)\n", (int)result.error());
94+
return false;
95+
}
96+
const auto& outputs = result.get();
97+
if (outputs.empty() || !outputs[0].isTensor()) {
98+
printf("FAIL: no tensor output\n");
99+
return false;
100+
}
101+
const auto& out_tensor = outputs[0].toTensor();
102+
if (static_cast<size_t>(out_tensor.numel()) != golden.size()) {
103+
printf(
104+
"FAIL: output numel %zu != golden %zu\n",
105+
(size_t)out_tensor.numel(),
106+
golden.size());
107+
return false;
108+
}
109+
const float* out_data = out_tensor.const_data_ptr<float>();
110+
111+
float max_abs_err = 0.0f;
112+
float max_rel_err = 0.0f;
113+
for (size_t i = 0; i < golden.size(); i++) {
114+
const float abs_err = std::abs(out_data[i] - golden[i]);
115+
max_abs_err = std::max(max_abs_err, abs_err);
116+
const float denom = std::max(std::abs(golden[i]), 1e-6f);
117+
max_rel_err = std::max(max_rel_err, abs_err / denom);
118+
}
119+
printf(
120+
"Max abs error: %e Max rel error: %e (%zu elements)\n",
121+
max_abs_err,
122+
max_rel_err,
123+
golden.size());
124+
// Lenient gate: pass iff abs<=tol OR rel<=tol (near-zero goldens).
125+
if (max_abs_err > 1e-3f && max_rel_err > 1e-3f) {
126+
printf("FAIL: dispatch_order[%s] exceeds tolerance 1e-3\n", tc.name);
127+
return false;
128+
}
129+
printf("PASS: dispatch_order[%s]\n", tc.name);
130+
return true;
131+
}
132+
133+
} // namespace
134+
135+
int main(int argc, char** argv) {
136+
std::string dir = "/tmp/dispatch_order";
137+
if (argc > 1) {
138+
dir = argv[1];
139+
}
140+
if (const char* env = std::getenv("WEBGPU_DISPATCH_ORDER_DIR")) {
141+
dir = env;
142+
}
143+
144+
WebGPUContext ctx;
145+
try {
146+
ctx = create_webgpu_context();
147+
} catch (const std::exception& e) {
148+
printf("SKIP: %s\n", e.what());
149+
return 0;
150+
}
151+
set_default_webgpu_context(&ctx);
152+
printf("WebGPU device acquired (native); case dir: %s\n", dir.c_str());
153+
154+
bool ok = true;
155+
for (const auto& tc : kCases) {
156+
ok = run_case(dir, tc) && ok;
157+
}
158+
159+
set_default_webgpu_context(nullptr);
160+
destroy_webgpu_context(ctx);
161+
162+
if (!ok) {
163+
return 1;
164+
}
165+
printf("\nAll dispatch_order tests passed\n");
166+
return 0;
167+
}

0 commit comments

Comments
 (0)