pytorch · JulianCloudNTH · Jun 13, 2026
@@ -40,6 +40,7 @@ set(WEBGPU_SRCS
     runtime/ops/quantized_linear/QuantizedLinear.cpp
     runtime/ops/embedding_q4gsw/EmbeddingQ4gsw.cpp
     runtime/ops/rope/RotaryEmbedding.cpp
+    runtime/ops/prepack/Prepack.cpp
 )
 
 add_library(webgpu_backend ${WEBGPU_SRCS})

diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp
@@ -454,6 +454,23 @@ void WebGPUGraph::build(
       webgpu_operator_registry().get_op_fn(op_name)(*this, args);
     }
   }
+
+  // Phase 4: one-time constant-prepack copies (mirrors Vulkan prepack phase).
+  // No poll (Dawn lacks wgpuDevicePoll); queue order syncs it before execute().
+  if (!prepack_copies_.empty()) {
+    WGPUCommandEncoderDescriptor enc_desc = {};
+    WGPUCommandEncoder encoder =
+        wgpuDeviceCreateCommandEncoder(device_, &enc_desc);
+    for (const auto& c : prepack_copies_) {
+      wgpuCommandEncoderCopyBufferToBuffer(
+          encoder, c.src, 0, c.dst, 0, c.nbytes);
+    }
+    WGPUCommandBufferDescriptor cmd_desc = {};
+    WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc);
+    wgpuQueueSubmit(queue_, 1, &cmd);
+    wgpuCommandBufferRelease(cmd);
+    wgpuCommandEncoderRelease(encoder);
+  }
 }
 
 WGPUShaderModule WebGPUGraph::get_or_create_shader(

diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h
@@ -50,6 +50,13 @@ struct OutputCopy {
   size_t nbytes = 0;
 };
 
+// A pending constant upload: one buffer->buffer copy that build() submits a
+// single time, at its end.
+struct PrepackCopy {
+  // Both handles are borrowed: tensors_[] owns them and the dtor frees them.
+  WGPUBuffer src{nullptr};
+  WGPUBuffer dst{nullptr};
+  // Number of bytes to copy.
+  size_t nbytes{0};
+};
+
 struct ExecuteConfig {
   size_t chunk_size = 0;
   size_t initial_chunk_size = 0;
@@ -180,6 +187,11 @@ class WebGPUGraph {
     dispatches_.push_back(dispatch);
   }
 
+  // Queue one constant-prepack copy; queued copies run once, at the end of
+  // build().
+  void add_prepack_copy(WGPUBuffer src, WGPUBuffer dst, size_t nbytes) {
+    PrepackCopy pending;
+    pending.src = src;
+    pending.dst = dst;
+    pending.nbytes = nbytes;
+    prepack_copies_.push_back(pending);
+  }
+
   void add_uniform_buffer_bytes(size_t bytes) {
     uniform_buffer_bytes_ += bytes;
   }
@@ -286,6 +298,9 @@ class WebGPUGraph {
 
   std::vector<WebGPUDispatch> dispatches_;
 
+  // Constant-prepack copies, executed once at the end of build().
+  std::vector<PrepackCopy> prepack_copies_;
+
   ExecuteConfig execute_config_;
 
   // Caches for reusing GPU objects across dispatches.

diff --git a/backends/webgpu/runtime/ops/prepack/Prepack.cpp b/backends/webgpu/runtime/ops/prepack/Prepack.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/webgpu/runtime/WebGPUGraph.h>
+#include <executorch/backends/webgpu/runtime/ops/OperatorRegistry.h>
+
+#include <stdexcept>
+
+namespace executorch::backends::webgpu {
+
+namespace {
+
+// Materialize a constant to its GPU buffer by queueing a dtype-agnostic byte
+// copy on the graph (the graph runs queued copies at the end of build()).
+//
+// args: [src (constant), out], the indices handed to graph.get_tensor(), per
+// et_vk.prepack.default.
+// Throws std::runtime_error when the arg count is wrong, when src/out disagree
+// on shape, element size or byte size, when either buffer is unbound, or when
+// the byte size is not a legal WebGPU buffer-to-buffer copy size.
+void prepack_impl(WebGPUGraph& graph, const std::vector<int>& args) {
+  if (args.size() != 2) {
+    throw std::runtime_error("WebGPU prepack: expected 2 args (src, out)");
+  }
+  const auto& src = graph.get_tensor(args.at(0));
+  const auto& out = graph.get_tensor(args.at(1));
+
+  if (src.dims != out.dims) {
+    throw std::runtime_error("WebGPU prepack: src/out shape mismatch");
+  }
+  if (src.elem_size != out.elem_size) {
+    throw std::runtime_error(
+        "WebGPU prepack: src/out dtype mismatch (cast unsupported)");
+  }
+  if (src.nbytes != out.nbytes) {
+    throw std::runtime_error("WebGPU prepack: src/out byte-size mismatch");
+  }
+  if (src.buffer == nullptr || out.buffer == nullptr) {
+    throw std::runtime_error("WebGPU prepack: null buffer binding");
+  }
+
+  // Every prepack copy is recorded into the same command encoder, so a single
+  // copy that fails WebGPU validation would take all constant uploads down
+  // with it. Handle the known-invalid cases here, while the op is identifiable.
+  if (src.buffer == out.buffer) {
+    // A buffer cannot be copied onto itself, and the bytes are already there.
+    return;
+  }
+  if (out.nbytes % 4 != 0) {
+    throw std::runtime_error(
+        "WebGPU prepack: byte size must be a multiple of 4");
+  }
+
+  graph.add_prepack_copy(src.buffer, out.buffer, out.nbytes);
+}
+
+} // namespace
+
+WEBGPU_REGISTER_OPERATORS { // register prepack_impl under the op name below
+  WEBGPU_REGISTER_OP(et_vk.prepack.default, prepack_impl);
+}
+
+} // namespace executorch::backends::webgpu
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -53,6 +53,10 @@ EMBEDDING_GOLDEN="/tmp/webgpu_embedding_q4gsw_golden.bin"
 ROPE_MODEL="/tmp/webgpu_rope.pte"
 ROPE_XQ_GOLDEN="/tmp/webgpu_rope_xq_golden.bin"
 ROPE_XK_GOLDEN="/tmp/webgpu_rope_xk_golden.bin"
+PREPACK_MODEL="/tmp/webgpu_prepack.pte"
+PREPACK_GOLDEN="/tmp/webgpu_prepack_golden.bin"
+PREPACK2_MODEL="/tmp/webgpu_prepack_mul_add.pte"
+PREPACK2_GOLDEN="/tmp/webgpu_prepack_mul_add_golden.bin"
 
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, export_chained_add_model
@@ -75,6 +79,12 @@ from executorch.backends.webgpu.test.ops.rope.test_rope import export_rope_model
 export_rope_model('${ROPE_MODEL}', '${ROPE_XQ_GOLDEN}', '${ROPE_XK_GOLDEN}')
 " || echo "WARN: rope export failed; webgpu_native_test apply_rotary_emb case self-skips"
 
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.prepack.test_prepack import export_prepack_model, export_prepack_mul_add_model
+export_prepack_model('${PREPACK_MODEL}', '${PREPACK_GOLDEN}')
+export_prepack_mul_add_model('${PREPACK2_MODEL}', '${PREPACK2_GOLDEN}')
+" || echo "WARN: prepack export failed; webgpu_native_test prepack cases self-skip"
+
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
 export_rms_norm_cases('${RMS_NORM_DIR}')
@@ -171,6 +181,10 @@ if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then
       WEBGPU_TEST_ROPE_MODEL="${ROPE_MODEL}" \
       WEBGPU_TEST_ROPE_XQ_GOLDEN="${ROPE_XQ_GOLDEN}" \
       WEBGPU_TEST_ROPE_XK_GOLDEN="${ROPE_XK_GOLDEN}" \
+      WEBGPU_TEST_PREPACK_MODEL="${PREPACK_MODEL}" \
+      WEBGPU_TEST_PREPACK_GOLDEN="${PREPACK_GOLDEN}" \
+      WEBGPU_TEST_PREPACK2_MODEL="${PREPACK2_MODEL}" \
+      WEBGPU_TEST_PREPACK2_GOLDEN="${PREPACK2_GOLDEN}" \
       "${BIN_DIR}/webgpu_native_test"
 else
   echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)"

diff --git a/backends/webgpu/test/ops/prepack/__init__.py b/backends/webgpu/test/ops/prepack/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/backends/webgpu/test/ops/prepack/test_prepack.py b/backends/webgpu/test/ops/prepack/test_prepack.py
@@ -0,0 +1,108 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Constant-tensor prepack (`et_vk.prepack`) export + golden for the WebGPU
+backend.
+
+The VulkanPartitioner wraps every constant feeding a delegated op in an
+`et_vk.prepack.default` node that materializes the constant into a GPU buffer at
+init. Model `M(x) = x + w` (w a constant) routes `w` through prepack, so the
+delegate must run the prepack copy for the output to equal `x + w` rather than
+`x + 0 = x`. The input is a deterministic /16 ramp so the native binary
+reconstructs it bit-for-bit; the torch-computed golden is written for the native
+binary to compare (it has no ATen).
+"""
+
+import unittest
+
+import torch
+
+import executorch.backends.vulkan.custom_ops_lib  # noqa: F401
+from executorch.backends.vulkan import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+
+# 4x4 constant weight, small enough to dump and reason about by hand.
+N = 4
+
+
+class _AddConst(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        # arange weight: non-zero everywhere so an unrun prepack (out = x + 0 = x)
+        # is unambiguously distinguishable from a correct one (out = x + w).
+        self.w = torch.nn.Parameter(
+            torch.arange(N * N, dtype=torch.float32).reshape(N, N)
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x + self.w
+
+
+class _MulAddConst(torch.nn.Module):
+    # Two constants (w1, w2) => two prepack nodes, exercising the multi-copy
+    # path E2E Llama relies on (many prepacked weights).
+    def __init__(self) -> None:
+        super().__init__()
+        self.w1 = torch.nn.Parameter(
+            torch.arange(N * N, dtype=torch.float32).reshape(N, N)
+        )
+        self.w2 = torch.nn.Parameter(
+            torch.arange(N * N, dtype=torch.float32).reshape(N, N) * 0.5 - 3.0
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x * self.w1 + self.w2
+
+
+def _inputs() -> tuple[torch.Tensor]:
+    # ((i % 13) - 6) / 16: exact in fp32, matches test_webgpu_native.cpp.
+    idx = torch.arange(N * N, dtype=torch.int64)
+    x = (((idx % 13) - 6).to(torch.float32) / 16.0).reshape(N, N)
+    return (x,)
+
+
+def _export(model, inputs):
+    ep = torch.export.export(model.eval(), inputs)
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+class TestPrepack(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        et = _export(_AddConst(), _inputs())
+        found = any(
+            d.id == "VulkanBackend"
+            for plan in et.executorch_program.execution_plan
+            for d in plan.delegates
+        )
+        self.assertTrue(found, "Expected a VulkanBackend delegate (x + w fusion)")
+
+
+def _write(model, pte_path: str, golden_path: str) -> None:
+    (x,) = _inputs()
+    golden = model.eval()(x)
+    et = _export(model, (x,))
+    with open(pte_path, "wb") as f:
+        f.write(et.buffer)
+    golden.detach().numpy().astype("<f4").tofile(golden_path)
+    print(f"Exported {pte_path}; golden {golden_path} ({golden.numel()} floats)")
+
+
+def export_prepack_model(pte_path: str, golden_path: str) -> None:
+    """Write the x + w .pte + torch golden (raw LE fp32). One prepacked constant.
+    The input is a /16 ramp reconstructed in the native test."""
+    _write(_AddConst(), pte_path, golden_path)
+
+
+def export_prepack_mul_add_model(pte_path: str, golden_path: str) -> None:
+    """Write the x * w1 + w2 .pte + golden. Two prepacked constants, exercising
+    the multi-copy path."""
+    _write(_MulAddConst(), pte_path, golden_path)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
@@ -633,6 +633,76 @@ static bool test_rope(
   return true;
 }
 
+// Runs the exported prepack model on the /16 ramp input and compares output 0
+// with the torch golden written by test_prepack.py. An unrun prepack copy
+// leaves the constant as zeros, so it shows up as a golden mismatch.
+// Prints a PASS/FAIL line and returns true only on PASS.
+static bool test_prepack(
+    const std::string& model_path,
+    const std::string& golden_path,
+    const std::string& label = "x + const w") {
+  constexpr int n = 4;
+  constexpr int numel = n * n;
+  constexpr float tol = 1e-3f;
+  printf("\n--- Test: prepack (%s, %dx%d) ---\n", label.c_str(), n, n);
+
+  Module module(model_path);
+  auto err = module.load_forward();
+  if (err != Error::Ok) {
+    printf("FAIL: could not load forward method (error %d)\n", (int)err);
+    return false;
+  }
+  printf("Model loaded: %s\n", model_path.c_str());
+
+  std::vector<float> golden = load_golden(golden_path, numel);
+  if (golden.empty()) {
+    printf("FAIL: could not load golden %s\n", golden_path.c_str());
+    return false;
+  }
+
+  // ((i % 13) - 6) / 16: exact in fp32, matches test_prepack.py::_inputs.
+  std::vector<float> x_data(numel);
+  for (int i = 0; i < numel; i++) {
+    x_data[i] = static_cast<float>((i % 13) - 6) / 16.0f;
+  }
+  auto x = make_tensor_ptr({n, n}, std::vector<float>(x_data));
+
+  auto result = module.forward({EValue(x)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+  const auto& out_tensor = outputs[0].toTensor();
+  if (out_tensor.numel() != numel) {
+    printf(
+        "FAIL: output numel %zu != expected %d\n",
+        (size_t)out_tensor.numel(),
+        numel);
+    return false;
+  }
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  // std::max(acc, NaN) returns acc, so a NaN output never reaches the maxima
+  // below. Judge each element with <= instead, which a NaN always fails.
+  float max_abs_err = 0.0f, max_rel_err = 0.0f;
+  int num_bad = 0;
+  for (int i = 0; i < numel; i++) {
+    const float ae = std::abs(out_data[i] - golden[i]);
+    const float re = ae / std::max(std::abs(golden[i]), 1e-6f);
+    max_abs_err = std::max(max_abs_err, ae);
+    max_rel_err = std::max(max_rel_err, re);
+    if (!(ae <= tol) || !(re <= tol)) {
+      num_bad++;
+    }
+  }
+  printf(
+      "Max abs error: %e   Max rel error: %e (checked %d elements)\n",
+      max_abs_err,
+      max_rel_err,
+      numel);
+  if (num_bad > 0) {
+    printf("%d element(s) out of tolerance or NaN\n", num_bad);
+    printf("FAIL: prepack exceeds tolerance 1e-3\n");
+    return false;
+  }
+  printf("PASS: prepack test\n");
+  return true;
+}
+
 // Reconstruct _ramp_input bit-for-bit, run the op, compare to the fp64 golden.
 static bool test_q4gsw_config(
     const Q4gswConfig& cfg,
@@ -1681,6 +1751,22 @@ int main(int argc, char** argv) {
     rope_xk_golden_path = env;
   }
 
+  std::string prepack_model_path, prepack_golden_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_MODEL")) {
+    prepack_model_path = env;
+  }
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK_GOLDEN")) {
+    prepack_golden_path = env;
+  }
+
+  std::string prepack2_model_path, prepack2_golden_path;
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_MODEL")) {
+    prepack2_model_path = env;
+  }
+  if (const char* env = std::getenv("WEBGPU_TEST_PREPACK2_GOLDEN")) {
+    prepack2_golden_path = env;
+  }
+
   // SDPA sweep: configs self-discover their sdpa_<name>.pte/.golden.bin under
   // this directory (default "" = the embedded-file root / cwd). Set
   // WEBGPU_TEST_SDPA_DIR to point at the exported .pte directory (e.g. /tmp/).
@@ -1747,6 +1833,16 @@ int main(int argc, char** argv) {
         ok;
   }
 
+  if (!prepack_model_path.empty() && !prepack_golden_path.empty()) {
+    ok = test_prepack(prepack_model_path, prepack_golden_path) && ok;
+  }
+
+  if (!prepack2_model_path.empty() && !prepack2_golden_path.empty()) {
+    ok = test_prepack(
+             prepack2_model_path, prepack2_golden_path, "x * w1 + w2") &&
+        ok;
+  }
+
   bool sdpa_ran = false;
   bool sdpa_ok = test_sdpa_sweep(sdpa_dir, &sdpa_ran);
   if (sdpa_ran) {