Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backends/webgpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ set(WEBGPU_SRCS
runtime/ops/quantized_linear/QuantizedLinear.cpp
runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp
runtime/ops/rope/RotaryEmbedding.cpp
runtime/ops/prepack/Prepack.cpp
)

add_library(webgpu_backend ${WEBGPU_SRCS})
Expand Down
17 changes: 17 additions & 0 deletions backends/webgpu/runtime/WebGPUGraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,23 @@ void WebGPUGraph::build(
webgpu_operator_registry().get_op_fn(op_name)(*this, args);
}
}

// Phase 4: one-time constant-prepack copies (mirrors Vulkan prepack phase).
// No poll (Dawn lacks wgpuDevicePoll); queue order syncs it before execute().
if (!prepack_copies_.empty()) {
WGPUCommandEncoderDescriptor enc_desc = {};
WGPUCommandEncoder encoder =
wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
for (const auto& c : prepack_copies_) {
wgpuCommandEncoderCopyBufferToBuffer(
encoder, c.src, 0, c.dst, 0, c.nbytes);
}
WGPUCommandBufferDescriptor cmd_desc = {};
WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
wgpuQueueSubmit(queue_, 1, &cmd);
wgpuCommandBufferRelease(cmd);
wgpuCommandEncoderRelease(encoder);
}
}

WGPUShaderModule WebGPUGraph::get_or_create_shader(
Expand Down
15 changes: 15 additions & 0 deletions backends/webgpu/runtime/WebGPUGraph.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@ struct OutputCopy {
size_t nbytes = 0;
};

// Describes a single buffer->buffer copy that materializes a prepacked
// constant. Copies are recorded while the graph is built and run one time, at
// the end of build().
struct PrepackCopy {
  // Copy source. Non-owning: owned by tensors_[], freed in dtor.
  WGPUBuffer src{nullptr};
  // Copy destination. Non-owning: owned by tensors_[], freed in dtor.
  WGPUBuffer dst{nullptr};
  // Number of bytes to copy from the start of src to the start of dst.
  size_t nbytes{0};
};

struct ExecuteConfig {
size_t chunk_size = 0;
size_t initial_chunk_size = 0;
Expand Down Expand Up @@ -180,6 +187,11 @@ class WebGPUGraph {
dispatches_.push_back(dispatch);
}

// Queue a constant-prepack copy of `nbytes` bytes from `src` to `dst`. Queued
// copies are executed once, at the end of build(). The buffers are stored as
// non-owning handles.
void add_prepack_copy(WGPUBuffer src, WGPUBuffer dst, size_t nbytes) {
  PrepackCopy copy;
  copy.src = src;
  copy.dst = dst;
  copy.nbytes = nbytes;
  prepack_copies_.push_back(copy);
}

// Add `bytes` to the running uniform_buffer_bytes_ total.
// NOTE(review): presumably used for uniform-buffer memory accounting; confirm
// against the readers of uniform_buffer_bytes_.
void add_uniform_buffer_bytes(size_t bytes) {
uniform_buffer_bytes_ += bytes;
}
Expand Down Expand Up @@ -286,6 +298,9 @@ class WebGPUGraph {

std::vector<WebGPUDispatch> dispatches_;

// Constant-prepack copies, executed once at the end of build().
std::vector<PrepackCopy> prepack_copies_;

ExecuteConfig execute_config_;

// Caches for reusing GPU objects across dispatches.
Expand Down
50 changes: 50 additions & 0 deletions backends/webgpu/runtime/ops/prepack/Prepack.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>

#include <stdexcept>

namespace executorch::backends::webgpu {

namespace {

// Materialize a constant to its GPU buffer: a dtype-agnostic byte copy.
//
// `args` must be [src (constant), out] tensor ids. The copy itself is not
// issued here; it is recorded on the graph and executed once at the end of
// WebGPUGraph::build().
//
// Throws std::runtime_error when the argument count is wrong, when src/out
// disagree on shape, element size or byte size, when either tensor has no
// buffer bound, or when the byte size cannot be expressed as a valid WebGPU
// buffer-to-buffer copy.
void prepack_impl(WebGPUGraph& graph, const std::vector<int>& args) {
  // et_vk.prepack.default args: [src (constant), out].
  if (args.size() != 2) {
    throw std::runtime_error("WebGPU prepack: expected 2 args (src, out)");
  }
  const auto& src = graph.get_tensor(args.at(0));
  const auto& out = graph.get_tensor(args.at(1));

  if (src.dims != out.dims) {
    throw std::runtime_error("WebGPU prepack: src/out shape mismatch");
  }
  if (src.elem_size != out.elem_size) {
    throw std::runtime_error(
        "WebGPU prepack: src/out dtype mismatch (cast unsupported)");
  }
  if (src.nbytes != out.nbytes) {
    throw std::runtime_error("WebGPU prepack: src/out byte-size mismatch");
  }
  if (src.buffer == nullptr || out.buffer == nullptr) {
    throw std::runtime_error("WebGPU prepack: null buffer binding");
  }

  // WebGPU validation requires the size of a buffer-to-buffer copy to be a
  // multiple of 4. All prepack copies share one command encoder, so a single
  // invalid copy would invalidate the whole batch and silently leave every
  // prepacked constant unwritten. Fail loudly here instead.
  if (out.nbytes % 4 != 0) {
    throw std::runtime_error(
        "WebGPU prepack: byte size must be a multiple of 4 for a buffer copy");
  }

  // WebGPU also rejects a copy whose source and destination are the same
  // buffer. If the two tensors alias one buffer the bytes are already in
  // place, so there is nothing to record.
  if (src.buffer == out.buffer) {
    return;
  }

  graph.add_prepack_copy(src.buffer, out.buffer, out.nbytes);
}

} // namespace

// Associate the `et_vk.prepack.default` op name with prepack_impl.
// NOTE(review): presumably this populates the registry that graph build
// queries by op name; confirm against OperatorRegistry.h.
WEBGPU_REGISTER_OPERATORS {
WEBGPU_REGISTER_OP(et_vk.prepack.default, prepack_impl);
}

} // namespace executorch::backends::webgpu
14 changes: 14 additions & 0 deletions backends/webgpu/scripts/test_webgpu_native_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin"
ROPE_MODEL="/tmp/webgpu_rope.pte"
ROPE_XQ_GOLDEN="/tmp/webgpu_rope_xq_golden.bin"
ROPE_XK_GOLDEN="/tmp/webgpu_rope_xk_golden.bin"
PREPACK_MODEL="/tmp/webgpu_prepack.pte"
PREPACK_GOLDEN="/tmp/webgpu_prepack_golden.bin"
PREPACK2_MODEL="/tmp/webgpu_prepack_mul_add.pte"
PREPACK2_GOLDEN="/tmp/webgpu_prepack_mul_add_golden.bin"

$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
Expand All @@ -75,6 +79,12 @@ from executorch.backends.webgpu.test.ops.rope.test_rope import export_rope_model
export_rope_model('${ROPE_MODEL}', '${ROPE_XQ_GOLDEN}', '${ROPE_XK_GOLDEN}')
" || echo "WARN: rope export failed; webgpu_native_test apply_rotary_emb case self-skips"

$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.prepack.test_prepack import export_prepack_model, export_prepack_mul_add_model
export_prepack_model('${PREPACK_MODEL}', '${PREPACK_GOLDEN}')
export_prepack_mul_add_model('${PREPACK2_MODEL}', '${PREPACK2_GOLDEN}')
" || echo "WARN: prepack export failed; webgpu_native_test prepack cases self-skip"

$PYTHON_EXECUTABLE -c "
from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
export_rms_norm_cases('${RMS_NORM_DIR}')
Expand Down Expand Up @@ -171,6 +181,10 @@ if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then
WEBGPU_TEST_ROPE_MODEL="${ROPE_MODEL}" \
WEBGPU_TEST_ROPE_XQ_GOLDEN="${ROPE_XQ_GOLDEN}" \
WEBGPU_TEST_ROPE_XK_GOLDEN="${ROPE_XK_GOLDEN}" \
WEBGPU_TEST_PREPACK_MODEL="${PREPACK_MODEL}" \
WEBGPU_TEST_PREPACK_GOLDEN="${PREPACK_GOLDEN}" \
WEBGPU_TEST_PREPACK2_MODEL="${PREPACK2_MODEL}" \
WEBGPU_TEST_PREPACK2_GOLDEN="${PREPACK2_GOLDEN}" \
"${BIN_DIR}/webgpu_native_test"
else
echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)"
Expand Down
5 changes: 5 additions & 0 deletions backends/webgpu/test/ops/prepack/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
108 changes: 108 additions & 0 deletions backends/webgpu/test/ops/prepack/test_prepack.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Constant-tensor prepack (`et_vk.prepack`) export + golden for the WebGPU
backend.

The VulkanPartitioner wraps every constant feeding a delegated op in an
`et_vk.prepack.default` node that materializes the constant into a GPU buffer at
init. Model `M(x) = x + w` (w a constant) routes `w` through prepack, so the
delegate must run the prepack copy for the output to equal `x + w` rather than
`x + 0 = x`. The input is a deterministic /16 ramp so the native binary
reconstructs it bit-for-bit; the torch-computed golden is written for the native
binary to compare (it has no ATen).
"""

import unittest

import torch

import executorch.backends.vulkan.custom_ops_lib # noqa: F401
from executorch.backends.vulkan import VulkanPartitioner
from executorch.exir import to_edge_transform_and_lower

# 4x4 constant weight, small enough to dump and reason about by hand.
N = 4


class _AddConst(torch.nn.Module):
    """Single-constant model: ``forward(x) = x + w`` with an N x N weight."""

    def __init__(self) -> None:
        super().__init__()
        # The weight is an arange ramp. If the prepack copy never runs, the
        # output is x + 0 = x, which is clearly different from the expected
        # x + w.
        ramp = torch.arange(N * N, dtype=torch.float32)
        self.w = torch.nn.Parameter(ramp.reshape(N, N))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the constant weight."""
        return x + self.w


class _MulAddConst(torch.nn.Module):
    """Two-constant model: ``forward(x) = x * w1 + w2``.

    Two constants (w1, w2) give two prepack nodes, which exercises the
    multi-copy path that E2E Llama relies on (many prepacked weights).
    """

    def __init__(self) -> None:
        super().__init__()
        scale = torch.arange(N * N, dtype=torch.float32).reshape(N, N)
        self.w1 = torch.nn.Parameter(scale)
        offset = torch.arange(N * N, dtype=torch.float32).reshape(N, N) * 0.5 - 3.0
        self.w2 = torch.nn.Parameter(offset)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` scaled by w1 and shifted by w2."""
        return x * self.w1 + self.w2


def _inputs() -> tuple[torch.Tensor]:
    """Build the deterministic N x N fp32 input shared with the native test.

    Element ``i`` is ((i % 13) - 6) / 16, which is exact in fp32 and matches
    the values reconstructed in test_webgpu_native.cpp.
    """
    centered = torch.arange(N * N, dtype=torch.int64).remainder(13).sub(6)
    ramp = centered.to(torch.float32).div(16.0)
    return (ramp.reshape(N, N),)


def _export(model, inputs):
    """Export ``model`` in eval mode, lower it with the VulkanPartitioner, and
    return the resulting ExecuTorch program."""
    exported = torch.export.export(model.eval(), inputs)
    lowered = to_edge_transform_and_lower(
        exported, partitioner=[VulkanPartitioner()]
    )
    return lowered.to_executorch()


class TestPrepack(unittest.TestCase):
    """Export-side checks for the constant-prepack model."""

    def test_export_delegates(self) -> None:
        """The exported x + w program must contain a VulkanBackend delegate."""
        program = _export(_AddConst(), _inputs()).executorch_program
        delegate_ids = [
            delegate.id
            for plan in program.execution_plan
            for delegate in plan.delegates
        ]
        self.assertTrue(
            "VulkanBackend" in delegate_ids,
            "Expected a VulkanBackend delegate (x + w fusion)",
        )


def _write(model, pte_path: str, golden_path: str) -> None:
    """Write the exported program for ``model`` to ``pte_path`` and its torch
    output on the shared input to ``golden_path``, as raw little-endian fp32."""
    inputs = _inputs()
    expected = model.eval()(*inputs)
    program = _export(model, inputs)
    with open(pte_path, "wb") as pte_file:
        pte_file.write(program.buffer)
    expected.detach().numpy().astype("<f4").tofile(golden_path)
    print(f"Exported {pte_path}; golden {golden_path} ({expected.numel()} floats)")


def export_prepack_model(pte_path: str, golden_path: str) -> None:
    """Export the single-constant ``x + w`` model.

    Writes the .pte to ``pte_path`` and the torch golden (raw LE fp32) to
    ``golden_path``. The input is the /16 ramp that the native test
    reconstructs.
    """
    model = _AddConst()
    _write(model, pte_path, golden_path)


def export_prepack_mul_add_model(pte_path: str, golden_path: str) -> None:
    """Export the two-constant ``x * w1 + w2`` model.

    Writes the .pte to ``pte_path`` and the golden to ``golden_path``. The two
    prepacked constants exercise the multi-copy path.
    """
    model = _MulAddConst()
    _write(model, pte_path, golden_path)


if __name__ == "__main__":
unittest.main()
96 changes: 96 additions & 0 deletions backends/webgpu/test/test_webgpu_native.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -633,6 +633,76 @@ static bool test_rope(
return true;
}

// Run a prepack model end to end and compare its output with a torch golden.
//
// model_path:  exported .pte to load and run.
// golden_path: golden file, loaded via load_golden with the expected element
//              count (n * n).
// label:       human-readable model description; only used in the banner.
//
// Returns true on PASS. On any failure (load, forward, shape, tolerance or NaN
// output) a FAIL line is printed and false is returned.
static bool test_prepack(
    const std::string& model_path,
    const std::string& golden_path,
    const std::string& label = "x + const w") {
  // et_vk.prepack copy vs golden; unrun copy leaves zeros. See test_prepack.py.
  constexpr int n = 4;
  constexpr int numel = n * n;
  constexpr float kTol = 1e-3f;
  printf("\n--- Test: prepack (%s, %dx%d) ---\n", label.c_str(), n, n);

  Module module(model_path);
  auto err = module.load_forward();
  if (err != Error::Ok) {
    printf("FAIL: could not load forward method (error %d)\n", (int)err);
    return false;
  }
  printf("Model loaded: %s\n", model_path.c_str());

  std::vector<float> golden = load_golden(golden_path, numel);
  if (golden.empty()) {
    printf("FAIL: could not load golden %s\n", golden_path.c_str());
    return false;
  }

  // ((i % 13) - 6) / 16: exact in fp32, matches test_prepack.py::_inputs.
  std::vector<float> x_data(numel);
  for (int i = 0; i < numel; i++) {
    x_data[i] = static_cast<float>((i % 13) - 6) / 16.0f;
  }
  auto x = make_tensor_ptr({n, n}, std::vector<float>(x_data));

  auto result = module.forward({EValue(x)});
  if (!result.ok()) {
    printf("FAIL: forward failed (error %d)\n", (int)result.error());
    return false;
  }
  const auto& outputs = result.get();
  if (outputs.empty() || !outputs[0].isTensor()) {
    printf("FAIL: no tensor output\n");
    return false;
  }
  const auto& out_tensor = outputs[0].toTensor();
  if (out_tensor.numel() != numel) {
    printf(
        "FAIL: output numel %zu != expected %d\n",
        (size_t)out_tensor.numel(),
        numel);
    return false;
  }
  const float* out_data = out_tensor.const_data_ptr<float>();

  float max_abs_err = 0.0f, max_rel_err = 0.0f;
  int num_bad = 0;
  int first_bad = -1;
  for (int i = 0; i < numel; i++) {
    const float ae = std::abs(out_data[i] - golden[i]);
    // Relative error is clamped at a 1e-6 denominator to avoid dividing by 0.
    const float re = ae / std::max(std::abs(golden[i]), 1e-6f);
    max_abs_err = std::max(max_abs_err, ae);
    max_rel_err = std::max(max_rel_err, re);
    // std::max(acc, NaN) returns acc, so the running maxima above never see a
    // NaN output. The negated `<=` is false-safe: any comparison with NaN is
    // false, so a NaN error is counted as out of tolerance here.
    if (!(ae <= kTol) || !(re <= kTol)) {
      if (num_bad == 0) {
        first_bad = i;
      }
      ++num_bad;
    }
  }
  printf(
      "Max abs error: %e Max rel error: %e (checked %d elements)\n",
      max_abs_err,
      max_rel_err,
      numel);
  if (num_bad > 0) {
    printf("FAIL: prepack exceeds tolerance 1e-3\n");
    printf(
        "  %d of %d elements out of tolerance or NaN; first at [%d]: got %e, expected %e\n",
        num_bad,
        numel,
        first_bad,
        out_data[first_bad],
        golden[first_bad]);
    return false;
  }
  printf("PASS: prepack test\n");
  return true;
}

// Reconstruct _ramp_input bit-for-bit, run the op, compare to the fp64 golden.
static bool test_q4gsw_config(
const Q4gswConfig& cfg,
Expand Down Expand Up @@ -1681,6 +1751,22 @@ int main(int argc, char** argv) {
rope_xk_golden_path = env;
}

std::string prepack_model_path, prepack_golden_path;
if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_MODEL")) {
prepack_model_path = env;
}
if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_GOLDEN")) {
prepack_golden_path = env;
}

std::string prepack2_model_path, prepack2_golden_path;
if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_MODEL")) {
prepack2_model_path = env;
}
if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_GOLDEN")) {
prepack2_golden_path = env;
}

// SDPA sweep: configs self-discover their sdpa_<name>.pte/.golden.bin under
// this directory (default "" = the embedded-file root / cwd). Set
// WEBGPU_TEST_SDPA_DIR to point at the exported .pte directory (e.g. /tmp/).
Expand Down Expand Up @@ -1747,6 +1833,16 @@ int main(int argc, char** argv) {
ok;
}

if (!prepack_model_path.empty() && !prepack_golden_path.empty()) {
ok = test_prepack(prepack_model_path, prepack_golden_path) && ok;
}

if (!prepack2_model_path.empty() && !prepack2_golden_path.empty()) {
ok = test_prepack(
prepack2_model_path, prepack2_golden_path, "x * w1 + w2") &&
ok;
}

bool sdpa_ran = false;
bool sdpa_ok = test_sdpa_sweep(sdpa_dir, &sdpa_ran);
if (sdpa_ran) {
Expand Down
Loading