Address QMoE review feedback on SM80 prepack docs and checks

Copilot · web-flow · commit 7cf33d36f7f7 · 2026-06-10T20:46:42.000Z
diff --git a/docs/contrib_ops/cuda/moe_qmoe.md b/docs/contrib_ops/cuda/moe_qmoe.md
@@ -71,7 +71,7 @@ input tokens → router (top-k softmax) → permute by expert
 | `expert_weight_bits` (QMoE only) | int | 4 | 4 (INT4/MXFP4) or 8 (INT8/FP8). |
 | `block_size` (QMoE only) | int | -1 | Group size for INT4/INT8 group-wise quantization. -1 = per-output-channel. |
 | `quant_type` (QMoE only) | string | `"int"` | `"int"`, `"fp4"`, `"fp8"`, `"wfp4afp8"`. See [§3](#3-quantization-modes). |
-| `weights_prepacked` (QMoE only) | int | -1 | Tri-state, only meaningful when `quant_type="int"`. The prepacked layouts selected by `-1` and `1` are **EP-determined**. `-1` (default): the INT4/INT8 `fc1`/`fc2` initializers are already prepacked in the EP's default layout (e.g. from `pack_weights_for_cuda_mixed_gemm` for the CUDA EP). `1`: already prepacked in the EP's SM90 (Hopper) layout. `0`: the initializers are raw `[E, N, K/pack]` tensors (as produced by `quantize_matmul_{4,8}bits`) and the kernel runs the CUTLASS layout transform in `PrePack()` for the runtime arch. **Note:** the CUDA EP INT4/INT8 MoE GEMM always runs the Ampere (SM80) kernel — even on SM90 — so it consumes the SM80 `fpA_intB` layout on all architectures; `-1` and `1` are therefore equivalent for the CUDA EP today, and `1` is reserved for a possible future Hopper-specific layout. See [§5.1](#51-weights-input-2--5--8). |
+| `weights_prepacked` (QMoE only) | int | -1 | Tri-state, only meaningful when `quant_type="int"`. The prepacked layouts selected by `-1` and `1` are **EP-determined**. `-1` (default): the INT4/INT8 `fc1`/`fc2` initializers are already prepacked in the EP's default layout (e.g. from `pack_weights_for_cuda_mixed_gemm` for the CUDA EP). `1`: already prepacked in an alternate EP-selected layout. `0`: the initializers are raw `[E, N, K/pack]` tensors (as produced by `quantize_matmul_{4,8}bits`) and the kernel runs the CUTLASS layout transform in `PrePack()`. **Note:** the CUDA EP INT4/INT8 MoE GEMM always runs the Ampere (SM80) kernel — even on SM90 — so it consumes the SM80 `fpA_intB` layout on all architectures; `-1` and `1` are therefore equivalent for the CUDA EP today, and `1` is reserved for a possible future Hopper-specific layout. See [§5.1](#51-weights-input-2--5--8). |
 
 ### 2.2 Type Constraints
 
@@ -1017,8 +1017,8 @@ over-aligned by-value parameters.
 - **In-`PrePack` INT weight layout transform** (`weights_prepacked=0`) is
   currently covered only by a smoke test (`TestQMoEIntPrePackSmoke`), not a
   bit-parity check: the existing offline pre-pack harness hardcodes
-  `force_arch=80` and produces incorrect output on SM≥90, so a parity
-  comparison against it is omitted until that harness honours the runtime SM.
+  `force_arch=80` (the same SM80 layout consumed by the CUDA EP on all GPUs),
+  but a dedicated parity harness for the in-`PrePack` path is still pending.
 - **Hopper W4A8** (INT4 weight + FP8 activation) is not supported — TRT-LLM gates
   its fast path to SM89 only.
 
diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc b/onnxruntime/contrib_ops/cuda/moe/moe_quantization.cc
@@ -67,7 +67,7 @@ QMoE::QMoE(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info), MoE
   // concrete prepacked layouts selected by -1 and 1 are determined by the
   // execution provider. The CUDA EP maps the tri-state as:
   //   -1 (default): already prepacked in the EP's default int weight layout.
-  //    1: already prepacked in the EP's SM90 (Hopper) int weight layout.
+  //    1: already prepacked in an alternate EP-selected int weight layout.
   //    0: raw [E, N, K/pack] initializers; the PrePack hook lays them out.
   //
   // Important: the CUDA QMoE int4/int8 MoE GEMM always dispatches to the
@@ -77,6 +77,8 @@ QMoE::QMoE(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info), MoE
   // consumes the SM80/Ampere CUTLASS fpA_intB layout on every GPU. As a result
   // the EP default (-1) is the SM80 layout regardless of the runtime device SM,
   // and SM80-format weights are valid on SM90 (they run via the SM80 kernel).
+  // For CUDA today, -1 and 1 are equivalent (both SM80 layout), and 1 is
+  // reserved for a possible future Hopper-specific layout.
   // PrePack (weights_prepacked=0) packs for the SM80 layout accordingly.
   const int64_t weights_prepacked_mode =
       op_kernel_info.GetAttrOrDefault<int64_t>("weights_prepacked", static_cast<int64_t>(-1));
@@ -1154,6 +1156,9 @@ void QMoE::PrePackIntExpertWeights(const Tensor& tensor, cudaStream_t stream, Al
                                    IAllocatorUniquePtr<void>& packed_buf, bool& is_packed) {
   ORT_ENFORCE(expert_weight_bits_ == 4 || expert_weight_bits_ == 8,
               "PrePackIntExpertWeights: only 4 and 8 bits are supported, got ", expert_weight_bits_);
+  ORT_ENFORCE(sm_ >= 75,
+              "PrePackIntExpertWeights: quant_type='int' with weights_prepacked=0 requires SM75+ CUDA hardware, got SM",
+              sm_);
   const auto& shape = tensor.Shape();
   ORT_ENFORCE(shape.NumDimensions() == 3,
               "PrePackIntExpertWeights: expected 3-D weight tensor [E, N, K/pack], got ndim=",