[ExecuTorch][WebGPU] linear_q4gsw test suite: Llama-1B shapes + 4k/8k sweep

JulianCloudNTH · JulianCloudNTH · commit d43568a331bb · 2026-06-12T17:33:59.000-07:00
Pull Request resolved: #20227 Adds the numerical test suite for `et_vk.linear_q4gsw` (stacked on the op diff), mirroring the SDPA test suite. A named CONFIGS sweep covers real Llama-3.2-1B linear shapes — q/o-proj (2048->2048), k/v-proj (2048->512), gate/up-proj (2048->8192), down-proj (8192->2048), lm_head (2048->128256) — plus 4k/8k large-token prefill (M=4096/8192 on the 2048->2048 and 2048->512 projections). `test/ops/quantized_linear/test_quantized_linear.py` exports each config's `.pte` + an fp64 dequant-matmul "truth" golden; `test/test_webgpu_native.cpp` reconstructs the deterministic ramp input bit-for-bit, runs the op on the GPU, and compares per element; `scripts/test_webgpu_native_ci.sh` wires the fixtures into the Dawn(Tint)+SwiftShader CI. ghstack-source-id: 392908895 @exported-using-ghexport Differential Revision: [D108314849](https://our.internmc.facebook.com/intern/diff/D108314849/)
diff --git a/backends/webgpu/scripts/test_webgpu_native_ci.sh b/backends/webgpu/scripts/test_webgpu_native_ci.sh
@@ -54,6 +54,11 @@ export_add_model('${PTE_MODEL}')
 export_chained_add_model('${PTE_CHAINED_MODEL}')
 " || echo "WARN: add export failed; webgpu_native_test self-skips models whose .pte is absent"
 
+$PYTHON_EXECUTABLE -c "
+from executorch.backends.webgpu.test.ops.quantized_linear.test_quantized_linear import export_all_quantized_linear_models
+export_all_quantized_linear_models('/tmp')
+" || echo "WARN: q4gsw export failed; required configs will FAIL in webgpu_native_test"
+
 $PYTHON_EXECUTABLE -c "
 from executorch.backends.webgpu.test.ops.rms_norm.test_rms_norm import export_rms_norm_cases
 export_rms_norm_cases('${RMS_NORM_DIR}')
@@ -143,6 +148,7 @@ if [[ -x "${BIN_DIR}/webgpu_native_test" && -f "${PTE_MODEL}" ]]; then
   env WEBGPU_TEST_MODEL="${PTE_MODEL}" \
       WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \
       WEBGPU_TEST_SDPA_DIR=/tmp/ \
+      WEBGPU_TEST_QUANTIZED_LINEAR_DIR=/tmp/ \
       "${BIN_DIR}/webgpu_native_test"
 else
   echo "(skipping webgpu_native_test: no exported .pte — needs the executorch python wheel)"
diff --git a/backends/webgpu/test/ops/quantized_linear/__init__.py b/backends/webgpu/test/ops/quantized_linear/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/backends/webgpu/test/ops/quantized_linear/test_quantized_linear.py b/backends/webgpu/test/ops/quantized_linear/test_quantized_linear.py
@@ -0,0 +1,160 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""4-bit weight-only quantized linear (`et_vk.linear_q4gsw`) export + fp64 golden.
+
+Mirrors test_sdpa.py: a named CONFIGS sweep over real Llama-3.2-1B linear shapes
+(q/o/k/v/gate/up/down proj + lm_head) plus large-M (4k/8k) prefill stress, each
+exported through VulkanPartitioner (which fuses dq+linear into
+`et_vk.linear_q4gsw.default`). The golden is the fp64 dequant-matmul truth
+(x @ dequant(W).T), so the GPU's fp32 error is measured against truth, not another
+fp32 approximation. The native test (test_webgpu_native.cpp) mirrors the same
+CONFIGS table and reconstructs the identical deterministic ramp input bit-for-bit.
+"""
+
+import os
+import unittest
+from dataclasses import dataclass
+
+import numpy as np
+import torch
+
+from executorch.backends.vulkan import VulkanPartitioner
+from executorch.exir import to_edge_transform_and_lower
+from torchao.quantization.granularity import PerGroup
+from torchao.quantization.quant_api import IntxWeightOnlyConfig, quantize_
+
+
+@dataclass(frozen=True)
+class Q4gswConfig:
+    name: str
+    m: int  # rows (tokens)
+    k: int  # in_features (reduction dim)
+    n: int  # out_features
+    group_size: int = 32  # K % group_size == 0, K % 8 == 0, N % 8 == 0
+    # heavy = huge fixture / slow on a CPU rasterizer; export_all skips unless asked.
+    heavy: bool = False
+
+
+# Single source of truth, mirrored by the C++ kQ4gswConfigs table. Llama-3.2-1B:
+# hidden=2048, n_heads=32 head_dim=64 (q/o=2048->2048), n_kv=8 (k/v=2048->512),
+# FFN=8192 (gate/up=2048->8192), down=8192->2048, vocab=128256 (lm_head).
+CONFIGS = [
+    # name              M     K       N
+    Q4gswConfig("q_proj", 1, 2048, 2048),  # also covers o_proj (same shape)
+    Q4gswConfig("kv_proj", 1, 2048, 512),  # k_proj / v_proj
+    Q4gswConfig("gate_proj", 1, 2048, 8192),  # gate_proj / up_proj
+    Q4gswConfig("down_proj", 1, 8192, 2048),  # big reduction K
+    Q4gswConfig("lm_head", 1, 2048, 128256, heavy=True),  # 131MB packed .pte
+    Q4gswConfig("q_proj_4k", 4096, 2048, 2048),  # 4k-token prefill
+    Q4gswConfig("kv_proj_4k", 4096, 2048, 512),
+    Q4gswConfig("q_proj_8k", 8192, 2048, 2048, heavy=True),  # 67MB golden
+    Q4gswConfig("kv_proj_8k", 8192, 2048, 512, heavy=True),
+]
+
+
+def _make_quantized_model(k: int, n: int, group_size: int) -> torch.nn.Module:
+    torch.manual_seed(0)  # load-bearing: fixes the weights the golden derives from
+    m = torch.nn.Linear(k, n, bias=False).eval()
+    quantize_(
+        m,
+        IntxWeightOnlyConfig(weight_dtype=torch.int4, granularity=PerGroup(group_size)),
+    )
+    return m
+
+
+def _ramp_input(m_rows: int, k: int) -> torch.Tensor:
+    """Deterministic fp32 input [M,K]; C++ q4gsw_ramp reconstructs it bit-for-bit.
+
+    x[flat] = ((flat % 17) - 8) / 16 over the flat row-major index -- exact in fp32
+    (small modulus, power-of-two denominator).
+    """
+    flat = np.arange(m_rows * k, dtype=np.int64)
+    x = ((flat % 17) - 8).astype(np.float32) / np.float32(16.0)
+    return torch.from_numpy(x).reshape(m_rows, k)
+
+
+def _fp64_golden(m: torch.nn.Module, x: torch.Tensor) -> np.ndarray:
+    """fp64 truth: x @ dequant(W).T. The kernel computes the same dequant-matmul, so
+    fp64 makes this the true answer -- GPU fp32 error is measured vs truth, not vs a
+    second fp32 approximation. torchao handles the signed-nibble recovery in dequantize().
+    """
+    wq = m.weight.dequantize()  # AffineQuantizedTensor -> dequantized weight [N,K]
+    golden = x.double() @ wq.double().t()  # [M,N] in fp64
+    return golden.to(torch.float32).numpy().astype("<f4")
+
+
+def _export(m: torch.nn.Module, x: torch.Tensor):
+    ep = torch.export.export(m, (x,))
+    return to_edge_transform_and_lower(
+        ep, partitioner=[VulkanPartitioner()]
+    ).to_executorch()
+
+
+class TestQuantizedLinear(unittest.TestCase):
+    def test_export_delegates(self) -> None:
+        # Each (non-heavy) config must fuse to a VulkanBackend delegate (q4gsw);
+        # fusion is shape-independent, so skipping the heavy 131MB+ fixtures is free.
+        for cfg in CONFIGS:
+            if cfg.heavy:
+                continue
+            with self.subTest(config=cfg.name):
+                m = _make_quantized_model(cfg.k, cfg.n, cfg.group_size)
+                et = _export(m, _ramp_input(1, cfg.k))
+                found = any(
+                    d.id == "VulkanBackend"
+                    for plan in et.executorch_program.execution_plan
+                    for d in plan.delegates
+                )
+                self.assertTrue(found, f"no VulkanBackend delegate in {cfg.name}")
+
+    def test_golden_matches_eager(self) -> None:
+        # Dual oracle (mirrors SDPA test_golden_matches_eager_op): the fp64 dequant-
+        # matmul truth and torchao's own fp32 quantized forward are independent refs
+        # that must agree -- guards a bug in the fp64 oracle / dequantize() accessor.
+        # M=1 non-heavy shapes (cheap; the math is shape-independent).
+        for cfg in CONFIGS:
+            if cfg.m != 1 or cfg.heavy:
+                continue
+            with self.subTest(config=cfg.name):
+                m = _make_quantized_model(cfg.k, cfg.n, cfg.group_size)
+                x = _ramp_input(1, cfg.k)
+                golden = torch.from_numpy(_fp64_golden(m, x))
+                torch.testing.assert_close(m(x), golden, atol=5e-4, rtol=1e-3)
+
+
+def export_quantized_linear_model(
+    cfg: Q4gswConfig, pte_path: str, golden_path: str
+) -> None:
+    """Export one config's q4gsw .pte + its fp64 golden (raw LE fp32)."""
+    m = _make_quantized_model(cfg.k, cfg.n, cfg.group_size)
+    x = _ramp_input(cfg.m, cfg.k)
+    et = _export(m, x)
+    with open(pte_path, "wb") as f:
+        f.write(et.buffer)
+    _fp64_golden(m, x).tofile(golden_path)
+    print(f"Exported {pte_path}; golden {golden_path} ({cfg.m * cfg.n} floats)")
+
+
+def export_all_quantized_linear_models(
+    out_dir: str, include_heavy: bool = False
+) -> None:
+    """Write q4gsw_<name>.pte + q4gsw_<name>.golden.bin for each config.
+
+    Heavy configs (lm_head 131MB .pte; M=8k 67MB goldens) are skipped unless
+    include_heavy -- plain CI never writes them; a real-GPU run opts in.
+    """
+    for cfg in CONFIGS:
+        if cfg.heavy and not include_heavy:
+            print(f"(skipping heavy config {cfg.name}; set include_heavy=True)")
+            continue
+        pte = os.path.join(out_dir, f"q4gsw_{cfg.name}.pte")
+        golden = os.path.join(out_dir, f"q4gsw_{cfg.name}.golden.bin")
+        export_quantized_linear_model(cfg, pte, golden)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp
@@ -375,6 +375,166 @@ static bool sdpa_within_tol(
   return ok;
 }
 
+// linear_q4gsw sweep config; mirrors CONFIGS in test_quantized_linear.py.
+struct Q4gswConfig {
+  const char* name;
+  int m; // rows (tokens)
+  int k; // in_features (reduction dim)
+  int n; // out_features
+  float tol_abs; // per-element abs gate
+  float tol_rel; // per-element rel gate
+  bool required; // dir set + .pte absent => FAIL (not skip)
+  bool heavy; // huge/slow: export-gated; runs only if WEBGPU_TEST_HEAVY
+};
+
+// Llama-3.2-1B linear shapes (q/o/k/v/gate/up/down + lm_head) + 4k/8k prefill.
+// tol scales with K (fp32 accum depth), not M; down_proj (K=8192) is looser.
+static const Q4gswConfig kQ4gswConfigs[] = {
+    // name         M     K     N      tol_abs tol_rel req    heavy
+    {"q_proj", 1, 2048, 2048, 1e-4f, 1e-3f, true, false},
+    {"kv_proj", 1, 2048, 512, 1e-4f, 1e-3f, true, false},
+    {"gate_proj", 1, 2048, 8192, 1e-4f, 1e-3f, true, false},
+    {"down_proj", 1, 8192, 2048, 1e-3f, 1e-2f, true, false}, // big-K accum
+    {"lm_head", 1, 2048, 128256, 1e-4f, 1e-3f, false, true},
+    {"q_proj_4k", 4096, 2048, 2048, 1e-4f, 1e-3f, true, false},
+    {"kv_proj_4k", 4096, 2048, 512, 1e-4f, 1e-3f, true, false},
+    {"q_proj_8k", 8192, 2048, 2048, 1e-4f, 1e-3f, false, true},
+    {"kv_proj_8k", 8192, 2048, 512, 1e-4f, 1e-3f, false, true},
+};
+
+// /16 ramp over the flat index; mirrors test_quantized_linear.py _ramp_input.
+static float q4gsw_ramp(int i) {
+  return static_cast<float>((i % 17) - 8) / 16.0f;
+}
+
+// Per-element dual tolerance (abs OR rel passes); a NaN error always fails.
+static bool quant_within_tol(
+    const float* out,
+    const float* golden,
+    int n,
+    float atol,
+    float rtol,
+    float* ma, // out: largest abs error seen
+    float* mr) { // out: largest rel error seen
+  float max_abs = 0.0f, max_rel = 0.0f;
+  bool ok = true;
+  for (int i = 0; i < n; i++) {
+    const float ae = std::abs(out[i] - golden[i]);
+    const float re = ae / std::max(std::abs(golden[i]), 1e-6f); // avoid /0
+    max_abs = std::max(max_abs, ae); // a NaN ae leaves the max unchanged
+    max_rel = std::max(max_rel, re);
+    if (!(ae <= atol || re <= rtol)) { // NaN compares false, so it fails
+      ok = false;
+    }
+  }
+  *ma = max_abs;
+  *mr = max_rel;
+  return ok;
+}
+
+// Reconstruct _ramp_input bit-for-bit, run the op, compare to the fp64 golden.
+static bool test_q4gsw_config(
+    const Q4gswConfig& cfg,
+    const std::string& pte,
+    const std::string& golden_path) {
+  printf(
+      "\n--- Test: linear_q4gsw (%s: M=%d,K=%d,N=%d) ---\n",
+      cfg.name,
+      cfg.m,
+      cfg.k,
+      cfg.n);
+
+  Module module(pte);
+  if (module.load_forward() != Error::Ok) {
+    printf("FAIL: could not load %s\n", pte.c_str());
+    return false;
+  }
+
+  const int in_numel = cfg.m * cfg.k;
+  const int out_numel = cfg.m * cfg.n;
+  std::vector<float> input(in_numel);
+  for (int i = 0; i < in_numel; i++) {
+    input[i] = q4gsw_ramp(i);
+  }
+
+  auto x = make_tensor_ptr({cfg.m, cfg.k}, std::vector<float>(input));
+  auto result = module.forward({EValue(x)});
+  if (!result.ok()) {
+    printf("FAIL: forward failed (error %d)\n", (int)result.error());
+    return false;
+  }
+  const auto& outputs = result.get();
+  if (outputs.empty() || !outputs[0].isTensor()) {
+    printf("FAIL: no tensor output\n");
+    return false;
+  }
+  const auto& out_tensor = outputs[0].toTensor();
+  if (out_tensor.numel() != out_numel) {
+    printf(
+        "FAIL: output numel %zu != expected %d\n",
+        (size_t)out_tensor.numel(),
+        out_numel);
+    return false;
+  }
+  const float* out_data = out_tensor.const_data_ptr<float>();
+
+  std::vector<float> golden = load_golden(golden_path, out_numel);
+  if (golden.empty()) {
+    printf("FAIL: could not load golden %s\n", golden_path.c_str());
+    return false;
+  }
+
+  float ma = 0.0f, mr = 0.0f;
+  const bool pass = quant_within_tol(
+      out_data, golden.data(), out_numel, cfg.tol_abs, cfg.tol_rel, &ma, &mr);
+  printf(
+      "Max abs error: %e   Max rel error: %e (checked %d elements)\n",
+      ma,
+      mr,
+      out_numel);
+  if (!pass) {
+    printf(
+        "FAIL: linear_q4gsw %s exceeds tolerance (abs %g OR rel %g)\n",
+        cfg.name,
+        cfg.tol_abs,
+        cfg.tol_rel);
+    return false;
+  }
+  printf("PASS: linear_q4gsw %s\n", cfg.name);
+  return true;
+}
+
+// q4gsw sweep: probe dir for each q4gsw_<name>.pte and run those present.
+static bool test_q4gsw_sweep(const std::string& dir, bool* ran) {
+  bool ok = true;
+  const bool heavy_run = std::getenv("WEBGPU_TEST_HEAVY") != nullptr; // any value
+  for (const auto& cfg : kQ4gswConfigs) {
+    const std::string pte = dir + "q4gsw_" + cfg.name + ".pte";
+    FILE* f = std::fopen(pte.c_str(), "rb"); // existence probe only
+    if (!f) {
+      if (cfg.required && !dir.empty()) { // explicit dir + required => FAIL
+        printf(
+            "FAIL: required q4gsw config %s has no .pte in %s\n",
+            cfg.name,
+            dir.c_str());
+        ok = false;
+      }
+      continue; // nothing to run for this config
+    }
+    std::fclose(f);
+    if (cfg.heavy && !heavy_run) { // heavy fixture present but not opted in
+      printf(
+          "SKIP: heavy q4gsw config %s (set WEBGPU_TEST_HEAVY=1 on a real GPU)\n",
+          cfg.name);
+      continue;
+    }
+    const std::string golden = dir + "q4gsw_" + cfg.name + ".golden.bin";
+    *ran = true; // only ever set, never cleared: caller must pre-init to false
+    ok = test_q4gsw_config(cfg, pte, golden) && ok; // run even after a failure
+  }
+  return ok;
+}
+
 // Fused sdpa_with_kv_cache sweep config. Mirrors the Python CONFIGS table in
 // test_sdpa.py exactly (name, Hq, Hkv, D, S, Cmax, input_pos).
 struct SdpaConfig {
@@ -1289,6 +1449,15 @@ int main(int argc, char** argv) {
     update_cache_model_path = env;
   }
 
+  // Quantized-linear sweep dir (mirrors WEBGPU_TEST_SDPA_DIR).
+  std::string qlinear_dir;
+  if (const char* env = std::getenv("WEBGPU_TEST_QUANTIZED_LINEAR_DIR")) {
+    qlinear_dir = env;
+    if (!qlinear_dir.empty() && qlinear_dir.back() != '/') {
+      qlinear_dir += '/';
+    }
+  }
+
   // SDPA sweep: configs self-discover their sdpa_<name>.pte/.golden.bin under
   // this directory (default "" = the embedded-file root / cwd). Set
   // WEBGPU_TEST_SDPA_DIR to point at the exported .pte directory (e.g. /tmp/).
@@ -1326,6 +1495,22 @@ int main(int argc, char** argv) {
     ok = test_update_cache(update_cache_model_path) && ok;
   }
 
+  bool q4gsw_ran = false;
+  bool q4gsw_ok = test_q4gsw_sweep(qlinear_dir, &q4gsw_ran);
+  if (q4gsw_ran) {
+    ok = q4gsw_ok && ok;
+  }
+  // Guard python<->C++ ramp bit-identity: q4gsw_ramp(0) = -0.5 exactly.
+  if (std::abs(q4gsw_ramp(0) - (-0.5f)) > 1e-12f) {
+    printf("FAIL: q4gsw_ramp bit-identity check\n");
+    ok = false;
+  }
+  if (!qlinear_dir.empty() && !q4gsw_ran) {
+    printf(
+        "FAIL: WEBGPU_TEST_QUANTIZED_LINEAR_DIR set but no q4gsw config ran\n");
+    ok = false;
+  }
+
   bool sdpa_ran = false;
   bool sdpa_ok = test_sdpa_sweep(sdpa_dir, &sdpa_ran);
   if (sdpa_ran) {