Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
6a1a203
[AMD] add MiniMax-M3 FP8 MI355X ATOM single-node benchmark script
seungrokj Jun 24, 2026
03f5752
[AMD] add minimaxm3-fp8-mi355x-atom recipe and bump atom image to 202…
seungrokj Jun 24, 2026
a245590
[AMD] add perf-changelog entry for minimaxm3-fp8-mi355x-atom
seungrokj Jun 24, 2026
25b8fab
[AMD] include minimaxm3-fp8-mi355x-atom-mtp in perf-changelog entry
seungrokj Jun 24, 2026
0e1a9e4
[AMD] add minimaxm3-fp4-mi355x-atom-mtp and bump atom image to 20260623
seungrokj Jun 24, 2026
c3d8c1f
[AMD] add perf-changelog entry for minimaxm3-fp4-mi355x-atom-mtp
seungrokj Jun 24, 2026
d799dd5
[AMD] revert minimaxm3-fp4-mi355x-atom-mtp image to MiniMax-M3-20260622
seungrokj Jun 24, 2026
2c2bf37
[AMD] remove minimaxm3_fp8_mi355x_atom.sh from fp4 branch (belongs to…
seungrokj Jun 24, 2026
a5c9f29
[AMD] remove fp8 atom perf-changelog entry from fp4 branch (belongs t…
seungrokj Jun 24, 2026
ce65d9a
[AMD] remove minimaxm3-fp8-mi355x-atom-mtp recipe from fp4 branch (be…
seungrokj Jun 24, 2026
266c375
[AMD] rename minimaxm3-fp8-mi355x-atom recipe to atom-mtp with spec-d…
seungrokj Jun 24, 2026
ec2d464
[AMD] swap SPEC_ARGS between fp4 atom and atom-mtp scripts
seungrokj Jun 24, 2026
b649691
[AMD] enable AITER_QUICK_REDUCE_QUANTIZATION=INT4 when CONC < 256 for…
seungrokj Jun 24, 2026
82b1909
[AMD] enable AITER_QUICK_REDUCE_QUANTIZATION=INT4 and disable fp8 kv …
seungrokj Jun 24, 2026
adf9696
[AMD] enable AITER_QUICK_REDUCE_QUANTIZATION=INT4 unconditionally and…
seungrokj Jun 25, 2026
7ec070f
Merge branch 'main' into amd/m3_atom_single_fp4_0623
seungrokj Jun 25, 2026
5edce27
[AMD] restore CONC < 256 condition for AITER_QUICK_REDUCE_QUANTIZATIO…
seungrokj Jun 25, 2026
a922012
[AMD] drop tp=2 search-space entries and enable AITER_QUICK_REDUCE_QU…
seungrokj Jun 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2587,11 +2587,8 @@ minimaxm3-fp8-mi355x-vllm-mtp:
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }

# MiniMax-M3 MXFP4 MI355X atom recipe:
# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
# block size 128 is mandatory for MSA. TP4 on a single gfx950 node, per the recipe.
minimaxm3-fp4-mi355x-atom:
image: rocm/atom-dev:M3
image: rocm/atom-dev:MiniMax-M3-20260623
model: amd/MiniMax-M3-MXFP4
model-prefix: minimaxm3
runner: mi355x
Expand All @@ -2603,14 +2600,30 @@ minimaxm3-fp4-mi355x-atom:
- isl: 1024
osl: 1024
search-space:
- { tp: 2, conc-start: 128, conc-end: 256 }
- { tp: 4, conc-start: 1, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 2, conc-start: 128, conc-end: 256 }
- { tp: 4, conc-start: 1, conc-end: 256 }
- { tp: 8, conc-start: 1, conc-end: 2 }

minimaxm3-fp4-mi355x-atom-mtp:
image: rocm/atom-dev:MiniMax-M3-20260623
model: amd/MiniMax-M3-MXFP4
model-prefix: minimaxm3
runner: mi355x
precision: fp4
framework: atom
multinode: false
scenarios:
fixed-seq-len:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }

minimaxm3-fp8-mi355x-atom-mtp:
image: rocm/atom-dev:MiniMax-M3-20260622
Expand Down
51 changes: 25 additions & 26 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_atom.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ check_env_vars \
RANDOM_RANGE_RATIO \
RESULT_FILENAME \
EP_SIZE \
DP_ATTENTION \
MAX_MODEL_LEN
DP_ATTENTION

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
Expand All @@ -22,41 +21,41 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO

SERVER_LOG=/workspace/server.log

export OMP_NUM_THREADS=1
PARALLEL_ARGS=(-tp "$TP") #TP
if [ "$DP_ATTENTION" = "true" ]; then
if [ "$EP_SIZE" -gt 1 ]; then #DP+EP
PARALLEL_ARGS=(-tp "$TP" --enable-expert-parallel --enable-dp-attention )
else #DP+TP
PARALLEL_ARGS=(-tp "$TP" --enable-dp-attention )
fi
fi

# Use the matrix-supplied MAX_MODEL_LEN (isl + osl + 256). Eval-only jobs need a
# larger window for the eval prompts, so override it from the eval context.
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
fi

if [ "$EP_SIZE" -gt 1 ]; then
EP=" --enable-expert-parallel"
else
EP=" "
fi
SPEC_ARGS=()

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
MEM_FRAC_STATIC=0.8

set -x

# Flags follow the ATOM MiniMax-M3 MXFP4 recipe (FP4 on 4xMI355 section):
# https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
# --block-size 128 is mandatory for MiniMax MSA. KV cache is left at the default
# dtype: amd/MiniMax-M3-MXFP4 ships no calibrated FP8 KV scales, so
# --kv_cache_dtype fp8 trips an assertion (k_scale is None) in the MSA
# fused_qknorm kernel during init.
export AITER_QUICK_REDUCE_QUANTIZATION=INT4
export AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0
export ATOM_M3_SPARSE_USE_ASM_PA=1
export MAX_MODEL_LEN=32768
export MAX_NUM_BATCHED_TOKENS=32768
export MAX_NUM_SEQS=256
python3 -m atom.entrypoints.openai_server \
--model $MODEL \
--server-port $PORT \
-tp $TP \
--max-model-len $MAX_MODEL_LEN $EP \
"${PARALLEL_ARGS[@]}" \
"${SPEC_ARGS[@]}" \
--block-size 128 \
--gpu-memory-utilization $MEM_FRAC_STATIC \
--max-model-len $MAX_MODEL_LEN \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--max-num-seqs $MAX_NUM_SEQS \
--kv_cache_dtype fp8 \
--trust-remote-code \
--no-enable_prefix_caching \
> $SERVER_LOG 2>&1 &

SERVER_PID=$!
Expand All @@ -76,7 +75,7 @@ run_benchmark_serving \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir /workspace/ \
--trust-remote-code
--trust-remote-code $( [[ ${#SPEC_ARGS[@]} -gt 0 ]] && echo "--use-chat-template" )

# After throughput, run evaluation only if RUN_EVAL is true
if [ "${RUN_EVAL}" = "true" ]; then
Expand All @@ -86,4 +85,4 @@ fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
set +x
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/env bash
#
# Single-node, fixed-sequence-length serving benchmark that drives the ATOM
# OpenAI-compatible server with EAGLE3 speculative decoding.
#
# Flow: start the server in the background, wait until it is ready, run the
# serving benchmark against it, optionally run an lm-eval pass, then stop GPU
# monitoring.
#
# Required environment (validated by check_env_vars below):
#   MODEL TP CONC ISL OSL RANDOM_RANGE_RATIO RESULT_FILENAME EP_SIZE DP_ATTENTION
# Also read:
#   PORT       server port; it is NOT validated here.
#              NOTE(review): confirm the launcher always exports it.
#   RUN_EVAL   when "true", lm-eval runs after the throughput benchmark.
#   SLURM_JOB_ID / SLURMD_NODENAME   only echoed when running under Slurm.
#
# check_env_vars, start_gpu_monitor, wait_for_server_ready,
# run_benchmark_serving, run_eval, append_lm_eval_summary and stop_gpu_monitor
# are not defined in this file; presumably they come from benchmark_lib.sh.

source "$(dirname "$0")/../../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME \
  EP_SIZE \
  DP_ATTENTION

if [[ -n "$SLURM_JOB_ID" ]]; then
  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

SERVER_LOG=/workspace/server.log

# Parallelism flags. Tensor parallelism is always passed. Expert parallelism
# is only requested together with DP attention, and only when EP_SIZE > 1.
PARALLEL_ARGS=(-tp "$TP")
if [[ "$DP_ATTENTION" == "true" ]]; then
  if [[ "$EP_SIZE" -gt 1 ]]; then
    PARALLEL_ARGS+=(--enable-expert-parallel) # DP+EP
  fi
  PARALLEL_ARGS+=(--enable-dp-attention) # DP+EP and DP+TP
fi

# Speculative decoding: EAGLE3 draft model proposing 3 tokens per step.
SPEC_ARGS=(--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens 3)

# The benchmark client uses the chat template whenever speculative decoding
# is configured. Built as an array so the flag is passed as exactly one word
# (or not at all) without relying on word splitting.
BENCH_EXTRA_ARGS=()
if [[ ${#SPEC_ARGS[@]} -gt 0 ]]; then
  BENCH_EXTRA_ARGS+=(--use-chat-template)
fi

# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
MEM_FRAC_STATIC=0.8

set -x
export AITER_QUICK_REDUCE_QUANTIZATION=INT4
export AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0
export ATOM_M3_SPARSE_USE_ASM_PA=1
export MAX_MODEL_LEN=32768
export MAX_NUM_BATCHED_TOKENS=32768
export MAX_NUM_SEQS=256

# Launch the server in the background; stdout and stderr go to SERVER_LOG.
python3 -m atom.entrypoints.openai_server \
  --model "$MODEL" \
  --server-port "$PORT" \
  "${PARALLEL_ARGS[@]}" \
  "${SPEC_ARGS[@]}" \
  --block-size 128 \
  --gpu-memory-utilization "$MEM_FRAC_STATIC" \
  --max-model-len "$MAX_MODEL_LEN" \
  --max-num-batched-tokens "$MAX_NUM_BATCHED_TOKENS" \
  --max-num-seqs "$MAX_NUM_SEQS" \
  --kv_cache_dtype fp8 \
  --trust-remote-code \
  --no-enable_prefix_caching \
  > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

# Wait for server to be ready
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

export PYTHONDONTWRITEBYTECODE=1

# Throughput run: 10 prompts per unit of concurrency.
run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts "$((CONC * 10))" \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir /workspace/ \
  --trust-remote-code "${BENCH_EXTRA_ARGS[@]}"

# After throughput, run evaluation only if RUN_EVAL is true
if [[ "${RUN_EVAL}" == "true" ]]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# Stop GPU monitoring
stop_gpu_monitor
set +x
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3984,6 +3984,15 @@
- "Expand search space for minimaxm3-fp4-mi355x-atom: add TP2 and TP8 configurations, extend concurrency range to 256 for ISL1024 and ISL8192, and add TP8 conc=1-2 for ISL8192."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1825

- config-keys:
- minimaxm3-fp4-mi355x-atom
- minimaxm3-fp4-mi355x-atom-mtp
description:
- "Add minimaxm3-fp4-mi355x-atom-mtp: MiniMax-M3 MXFP4 on MI355X with EAGLE3 speculative decoding (3 draft tokens)"
- "Bump image to rocm/atom-dev:MiniMax-M3-20260623 for both fp4 atom entries"
- "Search space: TP4 (TP2 entries dropped), ISL=1024,8192, OSL=1024, conc 1–256"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1917

- config-keys:
- minimaxm3-fp8-b300-vllm-mtp
description:
Expand Down