vllm-project · pavanimajety · May 20, 2026 · Apr 17, 2026 · Apr 17, 2026 · Apr 17, 2026
@@ -0,0 +1,229 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.platforms import current_platform
+
+if not current_platform.is_device_capability_family(120):
+    pytest.skip(
+        reason="FlashInfer CuteDSL SM12x MoE requires SM120 "
+        "(RTX Pro 6000 / DGX Spark).",
+        allow_module_level=True,
+    )
+
+from vllm.utils.flashinfer import has_flashinfer_b12x_moe
+
+if not has_flashinfer_b12x_moe():
+    pytest.skip(
+        reason=(
+            "FlashInfer cute_dsl_fused_moe_nvfp4 / convert_sf_to_mma_layout "
+            "not available in installed FlashInfer (needs PRs #3051 and #3066)."
+        ),
+        allow_module_level=True,
+    )
+
+# Import fp4_quantize after the skip guard — FlashInfer must be installed.
+from flashinfer.fp4_quantization import fp4_quantize
+
+import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_dummy_moe_config
+from tests.kernels.utils import torch_moe
+from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe import fused_topk
+from vllm.model_executor.layers.fused_moe.activation import MoEActivation
+from vllm.model_executor.layers.fused_moe.all2all_utils import (
+    maybe_make_prepare_finalize,
+)
+from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config
+from vllm.model_executor.layers.fused_moe.experts.flashinfer_b12x_moe import (
+    FlashInferB12xExperts,
+)
+from vllm.utils.flashinfer import flashinfer_convert_sf_to_mma_layout
+from vllm.utils.torch_utils import set_random_seed
+
+# Each entry is (m, n, k).  Dimensions are chosen to satisfy FP4 alignment
+# requirements (k multiple of 256, n multiple of 128) while keeping tests fast.
+MNK_FACTORS = [
+    (2, 128, 256),
+    (2, 256, 512),
+    (16, 128, 256),
+    (64, 256, 512),
+]
+
+
+def _reorder_gate_up_to_up_gate(
+    w: torch.Tensor,
+    w_s: torch.Tensor,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Move the second half of dim=1 in front of the first, for both tensors.
+
+    Converts the BF16 reference layout [gate (w1), up (w3)] into the
+    [up (w3), gate (w1)] layout the SM12x kernel expects, mirroring what
+    ``prepare_nvfp4_moe_layer_for_fi_or_cutlass`` does at model-load time.
+    """
+    half = w.shape[1] // 2  # split point is taken from ``w`` and reused for ``w_s``
+    w_swapped, w_s_swapped = (
+        torch.cat((t[:, half:, :], t[:, :half, :]), dim=1) for t in (w, w_s)
+    )
+    return w_swapped, w_s_swapped
+
+
+@pytest.mark.parametrize("m,n,k", MNK_FACTORS)
+@pytest.mark.parametrize("e", [8, 16])
+@pytest.mark.parametrize("topk", [1, 2, 4])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
+@torch.inference_mode()
+def test_flashinfer_b12x_moe(
+    m: int,
+    n: int,
+    k: int,
+    e: int,
+    topk: int,
+    dtype: torch.dtype,
+    workspace_init,  # fixture; unused in the body (presumably setup-only)
+):
+    """Test FlashInferB12xExperts against a BF16 torch reference.
+
+    The SM12x kernel takes BF16 hidden states directly and fuses token
+    dispatch, W1 GEMM, SwiGLU, and W2 GEMM into one call.  We verify
+    correctness against ``torch_moe`` using generous tolerances to account
+    for the internal FP4 quantization of activations and weights.
+
+    Scale convention
+    ----------------
+    The SM12x kernel uses ``w1_alpha`` as *both* the activation-quantisation
+    global scale and the weight dequantisation factor.  These two roles are
+    conflated into a single parameter in ``launch_sm120_moe``, so they must
+    equal the same value.  We use ``global_scale = 1.0`` for
+    ``fp4_quantize`` so that ``w1_alpha = ones`` satisfies both roles
+    simultaneously.  The alternative — vLLM's convention of baking a large
+    ``w_gs`` into block-scale values and compensating with
+    ``g1_alphas = 1/w_gs`` — is incompatible with this kernel.
+    """
+    set_random_seed(7)
+    with set_current_vllm_config(
+        VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
+    ):
+        a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
+
+        # Generate BF16 reference weights in [gate, up] order.
+        # Shape: w1=(e, 2n, k), w2=(e, k, n).
+        w1_bf16 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 15
+        w2_bf16 = torch.randn((e, k, n), device="cuda", dtype=dtype) / 15
+
+        # ------------------------------------------------------------------ #
+        # Quantise weights for the SM12x kernel using FlashInfer's convention:
+        #   global_scale = 1.0   →   block_scale = max_abs_block / fp4_max
+        #   w1_alpha = 1.0       (no extra global factor to compensate)
+        #
+        # The scale factors returned by fp4_quantize(..., is_sf_swizzled_layout=True)
+        # are already in the swizzled 2D layout expected by convert_sf_to_mma_layout.
+        # No additional swizzle_blockscale() call is needed.
+        # ------------------------------------------------------------------ #
+        gs = torch.ones(1, device="cuda", dtype=torch.float32)
+        sf_vec_size = 16
+
+        # W1: reorder BF16 [gate, up] → [up, gate] (still (e, 2n, k)), then quantise.
+        w1_reordered = torch.cat([w1_bf16[:, n:, :], w1_bf16[:, :n, :]], dim=1)
+        w1_flat = w1_reordered.reshape(e * 2 * n, k)
+        w1_q_flat, w1_sf_flat = fp4_quantize(
+            w1_flat,
+            global_scale=gs,
+            sf_vec_size=sf_vec_size,
+            is_sf_swizzled_layout=True,
+        )
+        w1_q = w1_q_flat.view(e, 2 * n, k // 2)  # uint8, packed FP4
+        w1_blockscale = w1_sf_flat.view(e, 2 * n, w1_sf_flat.shape[1])  # float8
+
+        # W2: no row reordering needed for the down-projection.
+        w2_flat = w2_bf16.reshape(e * k, n)
+        w2_q_flat, w2_sf_flat = fp4_quantize(
+            w2_flat,
+            global_scale=gs,
+            sf_vec_size=sf_vec_size,
+            is_sf_swizzled_layout=True,
+        )
+        w2_q = w2_q_flat.view(e, k, n // 2)  # uint8, packed FP4
+        w2_blockscale = w2_sf_flat.view(e, k, w2_sf_flat.shape[1])  # float8
+
+        # All per-expert alphas are 1.0 (global_scale = 1.0, no compensation).
+        ones_e = torch.ones(e, device="cuda", dtype=torch.float32)
+
+        quant_config = nvfp4_moe_quant_config(
+            g1_alphas=ones_e,
+            g2_alphas=ones_e,
+            a1_gscale=ones_e,
+            a2_gscale=ones_e,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+        )
+
+        moe_config = make_dummy_moe_config(
+            num_experts=e,
+            experts_per_token=topk,
+            hidden_dim=k,
+            intermediate_size_per_partition=n,
+            in_dtype=dtype,
+        )
+
+        experts = FlashInferB12xExperts(
+            moe_config=moe_config,
+            quant_config=quant_config,
+        )
+        # In production, process_weights_after_loading computes these after
+        # normalizing block scales. In the test the scales are already in final
+        # form (global_scale=1.0), so we compute the MMA layouts directly.
+        num_experts_w1, m1, k1_sf = w1_blockscale.shape
+        experts.w1_sf_mma = flashinfer_convert_sf_to_mma_layout(
+            w1_blockscale.reshape(num_experts_w1 * m1, k1_sf),
+            m=m1,
+            k=k1_sf * sf_vec_size,
+            num_groups=num_experts_w1,
+        )
+        num_experts_w2, m2, k2_sf = w2_blockscale.shape
+        experts.w2_sf_mma = flashinfer_convert_sf_to_mma_layout(
+            w2_blockscale.reshape(num_experts_w2 * m2, k2_sf),
+            m=m2,
+            k=k2_sf * sf_vec_size,
+            num_groups=num_experts_w2,
+        )
+
+        kernel = mk.FusedMoEKernel(
+            maybe_make_prepare_finalize(
+                moe=moe_config,
+                quant_config=quant_config,
+                allow_new_interface=True,
+                use_monolithic=False,
+            ),
+            experts,
+            inplace=False,
+        )
+
+        score = torch.randn((m, e), device="cuda", dtype=dtype)
+        topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
+
+        sm12x_output = kernel.apply(
+            hidden_states=a,
+            w1=w1_q,
+            w2=w2_q,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            global_num_experts=e,
+            activation=MoEActivation.SILU,
+            apply_router_weight_on_input=False,
+            expert_map=None,
+        )
+
+        # Reference: BF16 torch MoE using original [gate, up] BF16 weights.
+        # torch_moe's SiluAndMul expects [gate, up] order, matching w1_bf16.
+        torch_output = torch_moe(a, w1_bf16, w2_bf16, score, topk)
+
+        # NOTE(review): inputs are scaled by 1/10 and 1/15, so atol=2e-1 may exceed
+        # typical output magnitudes — confirm this check can actually fail.
+        torch.testing.assert_close(sm12x_output, torch_output, atol=2e-1, rtol=2e-1)
+
+
+if __name__ == "__main__":
+    raise SystemExit(pytest.main([__file__]))  # pytest injects workspace_init
@@ -13,6 +13,7 @@
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import (
     flashinfer_scaled_fp4_mm,
+    has_flashinfer_b12x_gemm,
 )
 from vllm.utils.torch_utils import set_random_seed
 
@@ -74,7 +75,7 @@ def get_ref_results(
 @pytest.mark.parametrize("shape", SHAPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("backend", ["cutlass", "cudnn", "trtllm"])
+@pytest.mark.parametrize("backend", ["cutlass", "cudnn", "trtllm", "b12x"])
 @pytest.mark.parametrize("autotune", [False, True])
 @torch.inference_mode()
 def test_flashinfer_nvfp4_gemm(
@@ -87,6 +88,10 @@ def test_flashinfer_nvfp4_gemm(
 ) -> None:
     if "trtllm" in backend and dtype == torch.float16:
         pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations")
+    if backend == "b12x" and not current_platform.has_device_capability(120):
+        pytest.skip("b12x FP4 GEMM requires SM120+ (CC 12.0+)")
+    if backend == "b12x" and not has_flashinfer_b12x_gemm():
+        pytest.skip("b12x FP4 GEMM backend not available in installed FlashInfer")
 
     set_random_seed(seed)
     m, n, packed_k = shape
@@ -105,8 +110,7 @@ def test_flashinfer_nvfp4_gemm(
 
     # ops.scaled_fp4_quant returns swizzled scales, while weights
     # from checkpoints are in linear scales.
-    # So instead of needing to swizzle for cutlass as in modelopt.py,
-    # we need to unswizzle for trtllm here.
+    # cutlass and b12x use swizzled scales directly; trtllm needs them unswizzled.
     a_fp4, a_scale_interleaved = ops.scaled_fp4_quant(
         a_dtype, a_global_scale, is_sf_swizzled_layout=True, backend=backend
     )

@@ -128,6 +128,7 @@ def with_default(
     "flashinfer_trtllm",
     "flashinfer_cutlass",
     "flashinfer_cutedsl",
+    "flashinfer_b12x",
     "marlin",
     "humming",
     "triton_unfused",
@@ -178,6 +179,8 @@ class KernelConfig:
     - "flashinfer_trtllm": Use FlashInfer with TRTLLM-GEN kernels
     - "flashinfer_cutlass": Use FlashInfer with CUTLASS kernels
     - "flashinfer_cutedsl": Use FlashInfer with CuteDSL kernels (FP4 only)
+    - "flashinfer_b12x": Use FlashInfer CuteDSL fused MoE for SM12x
+      (RTX Pro 6000 / DGX Spark)
     - "marlin": Use Marlin kernels (weight-only quantization)
     - "humming": Use Humming Mixed Precision kernels
     - "triton_unfused": Use Triton unfused MoE kernels

diff --git a/vllm/envs.py b/vllm/envs.py
@@ -1535,6 +1535,7 @@ def _get_or_set_default() -> str:
         "VLLM_NVFP4_GEMM_BACKEND",
         None,
         [
+            "flashinfer-b12x",
             "flashinfer-cudnn",
             "flashinfer-trtllm",
             "flashinfer-cutlass",

diff --git a/vllm/model_executor/kernels/linear/__init__.py b/vllm/model_executor/kernels/linear/__init__.py
@@ -102,6 +102,7 @@
     FbgemmNvFp4LinearKernel,
 )
 from vllm.model_executor.kernels.linear.nvfp4.flashinfer import (
+    FlashInferB12xNvFp4LinearKernel,
     FlashInferCudnnNvFp4LinearKernel,
     FlashInferCutlassNvFp4LinearKernel,
     FlashInferTrtllmNvFp4LinearKernel,
@@ -367,6 +368,9 @@ def _filter_kernels_by_backend(
 
 _POSSIBLE_NVFP4_KERNELS: dict[PlatformEnum, list[type[NvFp4LinearKernel]]] = {
     PlatformEnum.CUDA: [
+        # FlashInferB12xNvFp4LinearKernel excluded from auto-selection until
+        # upstream CUTLASS SM121 MMA op guard is resolved; use
+        # VLLM_NVFP4_GEMM_BACKEND=flashinfer-b12x to opt in explicitly.
         FlashInferCutlassNvFp4LinearKernel,
         CutlassNvFp4LinearKernel,
         MarlinNvFp4LinearKernel,
@@ -812,6 +816,7 @@ def init_wfp8_a16_linear_kernel(
 
 # Maps VLLM_NVFP4_GEMM_BACKEND env var values to kernel classes.
 _NVFP4_BACKEND_TO_KERNEL: dict[str, type[NvFp4LinearKernel]] = {
+    "flashinfer-b12x": FlashInferB12xNvFp4LinearKernel,
     "flashinfer-cutlass": FlashInferCutlassNvFp4LinearKernel,
     "cutlass": CutlassNvFp4LinearKernel,
     "marlin": MarlinNvFp4LinearKernel,
@@ -1041,6 +1046,7 @@ def register_linear_kernel(
     "CutlassNvFp4LinearKernel",
     "EmulationNvFp4LinearKernel",
     "FbgemmNvFp4LinearKernel",
+    "FlashInferB12xNvFp4LinearKernel",
     "FlashInferCutlassNvFp4LinearKernel",
     "FlashInferTrtllmNvFp4LinearKernel",
     "FlashInferCudnnNvFp4LinearKernel",