vllm-project · ECMGit · May 21, 2026
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -187,6 +187,10 @@
     VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "masked_gemm"] = (
         "latency"
     )
+    VLLM_FP8_MOE_BACKEND: (
+        Literal["triton", "deep_gemm", "cutlass", "flashinfer_trtllm",
+                "flashinfer_cutlass", "marlin", "aiter"] | None
+    ) = None
     VLLM_FLASHINFER_AUTOTUNE_CACHE_DIR: str | None = None
     VLLM_FLASHINFER_ALLREDUCE_BACKEND: Literal["auto", "trtllm", "mnnvl"] = "auto"
     VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024
@@ -1551,6 +1555,16 @@ def _resolve_rust_frontend_path() -> str | None:
             ["throughput", "latency", "masked_gemm"],
         ),
     ),
+    # FP8 MoE backend override. `--moe-backend` feeds both the NVFP4 and
+    # FP8 dispatchers, but FP4-only kernels (e.g. flashinfer_b12x) have no
+    # FP8 equivalent. When set, this takes precedence over `--moe-backend`
+    # for FP8 experts only (e.g. in mixed-precision checkpoints).
+    "VLLM_FP8_MOE_BACKEND": env_with_choices(
+        "VLLM_FP8_MOE_BACKEND",
+        None,
+        ["triton", "deep_gemm", "cutlass", "flashinfer_trtllm",
+         "flashinfer_cutlass", "marlin", "aiter"],
+    ),
     # Override the directory for the FlashInfer autotune config cache.
     "VLLM_FLASHINFER_AUTOTUNE_CACHE_DIR": lambda: os.getenv(
         "VLLM_FLASHINFER_AUTOTUNE_CACHE_DIR", None

@@ -202,7 +202,15 @@ def backend_to_kernel_cls(
 
 
 def map_fp8_backend(runner_backend: MoEBackend) -> Fp8MoeBackend:
-    """Map user's MoEBackend to Fp8MoeBackend."""
+    """Map user's MoEBackend to Fp8MoeBackend.
+
+    ``--moe-backend`` is consumed by both the NVFP4 and FP8 dispatchers.
+    For FP4-only kernels (e.g. ``flashinfer_b12x``), the FP8 dispatcher
+    has no equivalent and the mapping below would otherwise raise. Set
+    ``VLLM_FP8_MOE_BACKEND`` to override only the FP8-side routing while
+    keeping the user's ``--moe-backend`` value for the NVFP4 path.
+    """
+    effective = envs.VLLM_FP8_MOE_BACKEND or runner_backend
     mapping = {
         "triton": Fp8MoeBackend.TRITON,
         "deep_gemm": Fp8MoeBackend.DEEPGEMM,
@@ -212,10 +220,12 @@ def map_fp8_backend(runner_backend: MoEBackend) -> Fp8MoeBackend:
         "marlin": Fp8MoeBackend.MARLIN,
         "aiter": Fp8MoeBackend.AITER,
     }
-    if backend := mapping.get(runner_backend):
+    if backend := mapping.get(effective):
         return backend
+    src = ("VLLM_FP8_MOE_BACKEND env var"
+           if envs.VLLM_FP8_MOE_BACKEND else "--moe-backend")
     raise ValueError(
-        f"moe_backend='{runner_backend}' is not supported for FP8 MoE. "
+        f"FP8 MoE backend='{effective}' (from {src}) is not supported. "
         f"Expected one of {list(mapping.keys())}."
     )