Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions vllm/envs.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@
VLLM_FLASHINFER_MOE_BACKEND: Literal["throughput", "latency", "masked_gemm"] = (
"latency"
)
VLLM_FP8_MOE_BACKEND: (
Literal["triton", "deep_gemm", "cutlass", "flashinfer_trtllm",
"flashinfer_cutlass", "marlin", "aiter"] | None
) = None
VLLM_FLASHINFER_AUTOTUNE_CACHE_DIR: str | None = None
VLLM_FLASHINFER_ALLREDUCE_BACKEND: Literal["auto", "trtllm", "mnnvl"] = "auto"
VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE: int = 394 * 1024 * 1024
Expand Down Expand Up @@ -1551,6 +1555,16 @@ def _resolve_rust_frontend_path() -> str | None:
["throughput", "latency", "masked_gemm"],
),
),
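# FP8 MoE backend override. `--moe-backend` is consumed by both the
# NVFP4 and FP8 dispatchers, but FP4-only kernels (e.g. flashinfer_b12x)
# have no FP8 equivalent. When set, this value takes precedence over
# `--moe-backend` for the FP8 dispatcher only, so FP8 experts in
# mixed-precision checkpoints can be routed to a different backend.
# When unset (default None), the FP8 dispatcher uses `--moe-backend`.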
"VLLM_FP8_MOE_BACKEND": env_with_choices(
"VLLM_FP8_MOE_BACKEND",
None,
["triton", "deep_gemm", "cutlass", "flashinfer_trtllm",
"flashinfer_cutlass", "marlin", "aiter"],
),
# Override the directory for the FlashInfer autotune config cache.
"VLLM_FLASHINFER_AUTOTUNE_CACHE_DIR": lambda: os.getenv(
"VLLM_FLASHINFER_AUTOTUNE_CACHE_DIR", None
Expand Down
16 changes: 13 additions & 3 deletions vllm/model_executor/layers/fused_moe/oracle/fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,15 @@ def backend_to_kernel_cls(


def map_fp8_backend(runner_backend: MoEBackend) -> Fp8MoeBackend:
"""Map user's MoEBackend to Fp8MoeBackend."""
"""Map user's MoEBackend to Fp8MoeBackend.

``--moe-backend`` is consumed by both the NVFP4 and FP8 dispatchers.
For FP4-only kernels (e.g. ``flashinfer_b12x``), the FP8 dispatcher
has no equivalent and the mapping below would otherwise raise. Set
``VLLM_FP8_MOE_BACKEND`` to override only the FP8-side routing while
keeping the user's ``--moe-backend`` value for the NVFP4 path.
"""
effective = envs.VLLM_FP8_MOE_BACKEND or runner_backend
mapping = {
"triton": Fp8MoeBackend.TRITON,
"deep_gemm": Fp8MoeBackend.DEEPGEMM,
Expand All @@ -212,10 +220,12 @@ def map_fp8_backend(runner_backend: MoEBackend) -> Fp8MoeBackend:
"marlin": Fp8MoeBackend.MARLIN,
"aiter": Fp8MoeBackend.AITER,
}
if backend := mapping.get(runner_backend):
if backend := mapping.get(effective):
return backend
src = ("VLLM_FP8_MOE_BACKEND env var"
if envs.VLLM_FP8_MOE_BACKEND else "--moe-backend")
raise ValueError(
f"moe_backend='{runner_backend}' is not supported for FP8 MoE. "
f"FP8 MoE backend='{effective}' (from {src}) is not supported. "
f"Expected one of {list(mapping.keys())}."
)

Expand Down