Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12375,6 +12375,43 @@ minimaxm3-fp8-b300-vllm:
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of MiniMax-M3 NVFP4
# (nvidia/MiniMax-M3-NVFP4) B300 single-node vLLM, pairing the target with the
# Inferact/MiniMax-M3-EAGLE3 draft head (3 speculative tokens). MiniMax-M3
# modelopt NVFP4 support (vllm-project/vllm PR #46380) is baked into the perf
# container image, so no runtime patch is needed; prompts are routed through the
# chat template. Target weights are pre-staged read-only at
# /scratch/models/MiniMax-M3-NVFP4 (added to the STAGED_MODELS allow-list in
# launch_b300-nv.sh); the EAGLE3 draft is downloaded to the writable models dir.
minimaxm3-fp4-b300-vllm-mtp:
  # Container image, target checkpoint and the runner/framework this entry
  # is benchmarked on.
  image: vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41
  model: nvidia/MiniMax-M3-NVFP4
  model-prefix: minimaxm3
  runner: b300
  precision: fp4
  framework: vllm
  multinode: false
  scenarios:
    fixed-seq-len:
    # 1024-token input / 1024-token output sweep.
    - isl: 1024
      osl: 1024
      search-space:
      # Tensor-parallel only, then tensor + expert parallel.
      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
      # dp-attn rows cover the highest concurrencies.
      - { tp: 4, ep: 4, dp-attn: true, conc-start: 128, conc-end: 512, spec-decoding: mtp }
      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
    # 8192-token input / 1024-token output sweep.
    - isl: 8192
      osl: 1024
      search-space:
      # Tensor-parallel only, then tensor + expert parallel.
      - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
      # dp-attn rows use lower concurrency ceilings than the 1024/1024 sweep.
      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# MiniMax-M3 day-zero (https://recipes.vllm.ai/MiniMaxAI/MiniMax-M3).
# 427B total / 26B active MoE with MSA sparse attention; MXFP8 checkpoint
# (MiniMaxAI/MiniMax-M3-MXFP8, ~444 GB) quantized by NVIDIA — native MX tensor
Expand Down
112 changes: 112 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_b300_mtp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/usr/bin/env bash

# MiniMax-M3 NVFP4 B300 single-node vLLM recipe with EAGLE3 speculative
# decoding — same shape as minimaxm3_fp8_b300_mtp.sh but uses the
# nvidia/MiniMax-M3-NVFP4 checkpoint. MiniMax-M3 modelopt NVFP4 support
# (vllm-project/vllm PR #46380) is baked into the perf container image, so no
# runtime patch is needed.

# Load the shared benchmark helpers, located relative to this script's own
# path so the script works regardless of the caller's working directory.
source "$(dirname "$0")/../../benchmark_lib.sh"

# Inputs this recipe reads from the environment. check_env_vars is not
# defined in this file; presumably it comes from benchmark_lib.sh and
# rejects a run when any of these is missing.
check_env_vars MODEL TP EP_SIZE DP_ATTENTION CONC ISL OSL \
    MAX_MODEL_LEN RANDOM_RANGE_RATIO RESULT_FILENAME

# EAGLE3 draft head paired with the target model for speculative decoding.
DRAFT_MODEL="Inferact/MiniMax-M3-EAGLE3"

# Succeeds when $1 is an existing directory that contains at least one entry
# (hidden entries included).
_dir_is_populated() {
    [[ -d "$1" && -n "$(ls -A "$1" 2>/dev/null)" ]]
}

# Weight staging.
#   * MODEL_PATH unset (stand-alone run): download both models into the HF
#     cache and refer to them by repository id.
#   * MODEL_PATH set (the b300 launcher points it at the pre-staged,
#     read-only /scratch/models/MiniMax-M3-NVFP4): use the target from there,
#     downloading only if the directory is missing or empty. The draft is not
#     pre-staged and cannot be written next to the read-only target, so it
#     goes to the writable models dir (/data/models) instead.
if [[ -z "${MODEL_PATH:-}" ]]; then
    hf download "$MODEL"
    export MODEL_PATH="$MODEL"
    hf download "$DRAFT_MODEL"
    DRAFT_MODEL_PATH="$DRAFT_MODEL"
else
    if ! _dir_is_populated "$MODEL_PATH"; then
        hf download "$MODEL" --local-dir "$MODEL_PATH"
    fi
    # Keep only the repository name (text after the last '/') as the
    # directory name under /data/models.
    DRAFT_MODEL_PATH="/data/models/${DRAFT_MODEL##*/}"
    if ! _dir_is_populated "$DRAFT_MODEL_PATH"; then
        hf download "$DRAFT_MODEL" --local-dir "$DRAFT_MODEL_PATH"
    fi
fi

# Under Slurm, record which job and node this run landed on.
if [[ -n "$SLURM_JOB_ID" ]]; then
    printf 'JOB %s running on %s\n' "$SLURM_JOB_ID" "$SLURMD_NODENAME"
fi

# Capture the GPU state in the job output before anything is launched.
nvidia-smi

# The vLLM server's stdout and stderr are redirected to this file.
SERVER_LOG=/workspace/server.log

# vLLM environment settings for this recipe. The first presumably extends
# the engine start-up wait to one hour; confirm against the vLLM docs.
export VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_FLOAT32_MATMUL_PRECISION=high

# Map the sweep's parallelism settings onto vLLM flags. Start from plain
# tensor parallelism and adjust:
#   * DP attention: TP collapses to 1, $TP becomes the data-parallel size,
#     and expert parallelism is enabled.
#   * otherwise, EP_SIZE > 1 adds expert parallelism on top of TP.
PARALLEL_ARGS="--tensor-parallel-size=$TP"
if [[ "$DP_ATTENTION" == "true" ]]; then
    PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
elif [ "$EP_SIZE" -gt 1 ]; then
    PARALLEL_ARGS+=" --enable-expert-parallel"
fi

# Every configuration currently drafts the same number of tokens per step.
NUM_SPEC_TOKENS=3

# Eval-only runs replace the benchmark context length with the eval one once
# setup_eval_context has run (presumably that helper sets EVAL_MAX_MODEL_LEN).
if [[ "$EVAL_ONLY" == "true" ]]; then
    setup_eval_context
    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi
start_gpu_monitor

# PORT is used by the server, the readiness probe and the benchmark client
# but is not in the check_env_vars list. Stop here with a clear message
# instead of letting an empty value make --port swallow the next flag.
: "${PORT:?PORT must be set before launching vllm serve}"

# Launch the vLLM server in the background with the EAGLE3 draft attached,
# sending stdout and stderr to $SERVER_LOG. Notes on the flags:
#   * $PARALLEL_ARGS is a space-separated flag string and is deliberately
#     left unquoted so it splits into separate arguments.
#   * --max-num-batched-tokens is twice the input sequence length.
#   * --served-model-name keeps the API model name equal to $MODEL even
#     though the weights are loaded from $MODEL_PATH.
# All other expansions are quoted so an empty or whitespace-containing value
# yields a clear argument error rather than a shifted argument list.
set -x
# shellcheck disable=SC2086
vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" \
    $PARALLEL_ARGS \
    --gpu-memory-utilization 0.90 \
    --max-model-len "$MAX_MODEL_LEN" \
    --block-size 128 \
    --language-model-only \
    --max-cudagraph-capture-size 2048 \
    --max-num-batched-tokens "$((ISL * 2))" \
    --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL_PATH\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS, \"attention_backend\": \"FLASH_ATTN\"}" \
    --stream-interval 20 --no-enable-prefix-caching \
    --trust-remote-code > "$SERVER_LOG" 2>&1 &

# PID of the background server, handed to the readiness helper below.
SERVER_PID=$!

# Hand the port, log file and PID to the readiness helper before the
# benchmark client is started.
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

# Python packages installed just before the benchmark client runs
# (presumably needed by it).
pip install -q datasets pandas

# Benchmark client arguments: ten prompts per concurrency slot, with prompts
# routed through the model's chat template.
_bench_cli_args=(
    --model "$MODEL"
    --port "$PORT"
    --backend vllm
    --input-len "$ISL"
    --output-len "$OSL"
    --random-range-ratio "$RANDOM_RANGE_RATIO"
    --num-prompts "$((CONC * 10))"
    --max-concurrency "$CONC"
    --result-filename "$RESULT_FILENAME"
    --result-dir /workspace/
    --trust-remote-code
    --use-chat-template
)
run_benchmark_serving "${_bench_cli_args[@]}"

# Optional accuracy pass against the same running server.
if [[ "$RUN_EVAL" == "true" ]]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

# Stop GPU monitoring and turn command tracing back off.
stop_gpu_monitor
set +x
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4205,6 +4205,15 @@
- "Initial submission: MiniMax-M3 MXFP4 disagg (prefill/decode) on MI355X with vLLM over the MoRI-IO KV connector (8k/1k)."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1914

- config-keys:
- minimaxm3-fp4-b300-vllm-mtp
description:
- "Add MiniMax-M3 NVFP4 (nvidia/MiniMax-M3-NVFP4) B300 single-node aggregated vLLM benchmark with EAGLE3 speculative decoding (spec-decoding: mtp, 3 draft tokens via Inferact/MiniMax-M3-EAGLE3)"
- "Image vllm/vllm-openai:vllm-minimax-m3-perf-x86_64-13.0.1-8b00f41 (bakes in MiniMax-M3 modelopt NVFP4 support, vllm-project/vllm PR #46380; no runtime patch needed); prompts routed through the chat template"
- "Target weights pre-staged read-only at /scratch/models/MiniMax-M3-NVFP4 (added MiniMax-M3-NVFP4 to launch_b300-nv.sh STAGED_MODELS); EAGLE3 draft downloaded to the writable /data/models; --block-size 128 (MSA), --language-model-only"
- "Sweeps tp 4/8 with and without EP and dp-attn at 1k1k and 8k1k, conc 1-512"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1929

- config-keys:
- minimaxm3-fp4-mi355x-vllm
description:
Expand Down
1 change: 1 addition & 0 deletions runners/launch_b300-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -362,6 +362,7 @@ else
MiniMax-M2.7
MiniMax-M2.7-NVFP4
MiniMax-M3
MiniMax-M3-NVFP4
Qwen3.5-397B-A17B
Qwen3.5-397B-A17B-FP8
Qwen3.5-397B-A17B-NVFP4
Expand Down