Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2632,7 +2632,7 @@ minimaxm3-fp8-mi355x-atom-mtp:
- { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }

minimaxm3-fp8-mi355x-atom-disagg:
image: rocm/atom-dev:MiniMax-M3-20260622
image: rocm/atom-dev:MiniMax-M3-20260623
model: amd/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi355x-disagg
Expand Down
26 changes: 7 additions & 19 deletions benchmarks/multi_node/amd_utils/env_atom.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,36 +32,24 @@ else
fi
export IBDEVICES

export SAFETENSORS_FAST_GPU=1
export VLLM_LOG_LEVEL=WARNING
export ATOM_LOG_LEVEL=WARNING
export AITER_LOG_LEVEL=WARNING
export LOG_LEVEL=WARNING
export LOGLEVEL=WARNING

# =============================================================================
# ATOM/mooncake-specific environment
# =============================================================================

# mooncake RDMA KV transfer library path
export LD_LIBRARY_PATH=/opt/venv/lib/python3.10/site-packages/mooncake:/opt/rocm/lib:${LD_LIBRARY_PATH:-}


# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP)
# faster model loading (safetensors only)
export SAFETENSORS_FAST_GPU=1

# aiter logging (WARNING to reduce noise; use DEBUG for troubleshooting)
export VLLM_LOG_LEVEL=WARNING
export ATOM_LOG_LEVEL=WARNING
export AITER_LOG_LEVEL=WARNING

if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
# ATOM MoE gather/scatter interleave optimization
export ATOM_MOE_GU_ITLV=1
# Disable bf16->fp8 MoE bound (only for DeepSeek-V4-Pro)
export AITER_BF16_FP8_MOE_BOUND=0
fi

# Clear stale ATOM cache on startup (server_atom.sh handles this via rm -rf)
# No env var needed; documented here for reference.
export LOG_LEVEL=WARNING
export LOGLEVEL=WARNING

set +x

# ATOM_HOST_IP is set per-node in server_atom.sh (= host_ip, used as handshake IP)
echo "[INFO] ATOM env: IBDEVICES=$IBDEVICES LD_LIBRARY_PATH includes mooncake"
68 changes: 26 additions & 42 deletions benchmarks/multi_node/amd_utils/models_atom.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Model-specific SGLang server configurations for disaggregated inference.
# Model-specific ATOM server configurations for disaggregated inference.
#
# Each top-level key is a MODEL_NAME value (must match the directory name under MODEL_DIR).
#
Expand All @@ -7,50 +7,34 @@
#
# Schema:
# <model-name>:
# base_flags: str # Common flags for both prefill and decode
# mtp_flags: str # Appended to decode when DECODE_MTP_SIZE > 0
# dp_flags: str # Appended when DP is enabled (prefill or decode)
# prefill:
# mem_fraction_static: float
# disable_radix_cache: bool
# dp: # Config when data-parallel attention is enabled
# max_running_requests: int
# chunked_prefill_size: str # Can be integer or bash arithmetic expression
# cuda_graph_bs: str # Space-separated values
# no_dp: # Config when data-parallel attention is disabled
# max_running_requests: int
# chunked_prefill_size: int
# cuda_graph_bs_range: str # "start-end" expanded via seq
# decode:
# mem_fraction_static: float
# prefill_round_robin_balance: bool
# dp:
# max_running_requests: int
# chunked_prefill_size: str
# cuda_graph_bs_range: str
# ep_only: # Config when EP is enabled but DP is disabled
# max_running_requests: int
# chunked_prefill_size: int
# cuda_graph_bs_range: str
# no_dp:
# max_running_requests: int
# chunked_prefill_size: int
# cuda_graph_bs_range: str
# env: str # Space-separated KEY=VALUE pairs exported unconditionally
# hf_overrides: str # JSON string passed to --hf-overrides
# tp_dp_flags: str # Parallel flags for TP+DPA case (must include --enable-dp-attention)
# tp_dp_env: str # Space-separated KEY=VALUE pairs exported only in TP+DPA mode
# ep_dp_flags: str # Parallel flags for EP+DPA case (must include --enable-expert-parallel --enable-dp-attention)
# ep_dp_env: str # Space-separated KEY=VALUE pairs exported only in EP+DPA mode
# mtp_flags: str # Flags passed to SPEC_ARGS before $DECODE_MTP_SIZE (e.g. "--method mtp --num-speculative-tokens")
# kv_cache_flags: str # Full --kv_cache_dtype flag string (e.g. "--kv_cache_dtype fp8", or "" for none)

DeepSeek-V4-Pro:
# ATOM engine (atom-disagg): server_atom.sh uses MEM_FRACTION/KV_CACHE_DTYPE/BLOCK_SIZE/MAX_NUM_SEQS
# directly from env vars (defaulting to 0.85/fp8/16/256). base_flags/dp_flags are not used by
# server_atom.sh; they are kept here for documentation and potential future use.
base_flags: ""
mtp_flags: ""
dp_flags: ""
env: "ATOM_MOE_GU_ITLV=1 AITER_BF16_FP8_MOE_BOUND=0"
kv_cache_flags: "--kv_cache_dtype fp8"
tp_dp_flags: "--enable-dp-attention --enable-tbo"
tp_dp_env: "GPU_MAX_HW_QUEUES=5 ATOM_CPU_AFFINITY=1"
ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
mtp_flags: "--method mtp --num-speculative-tokens"
hf_overrides: '{"use_index_cache":true,"index_topk_freq":4}'

MiniMax-M3-MXFP4:
base_flags: ""
mtp_flags: ""
dp_flags: ""
env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0"
kv_cache_flags: "--kv_cache_dtype fp8"
tp_dp_flags: "--enable-dp-attention"
ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"

MiniMax-M3-MXFP8:
base_flags: ""
mtp_flags: ""
dp_flags: ""
env: "AITER_QUICK_REDUCE_QUANTIZATION=INT4 ATOM_M3_SPARSE_USE_ASM_PA=1 AITER_QUICK_REDUCE_CAST_BF16_TO_FP16=0"
kv_cache_flags: "--kv_cache_dtype fp8"
tp_dp_flags: "--enable-dp-attention"
ep_dp_flags: "--enable-expert-parallel --enable-dp-attention"
mtp_flags: "--method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens"
Comment thread
seungrokj marked this conversation as resolved.
89 changes: 49 additions & 40 deletions benchmarks/multi_node/amd_utils/server_atom.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}"

# ATOM server tuning (from reference script defaults)
MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}"
KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}"
BLOCK_SIZE="${BLOCK_SIZE:-16}"
MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-}"
Expand Down Expand Up @@ -78,6 +77,24 @@ if [[ -z "$host_ip" ]]; then
fi
host_name=$(hostname)

# =============================================================================
# Model-Specific Configuration from YAML
# =============================================================================
# Load model-specific config from YAML (single parse for all fields)
eval "$(python3 -c "
import yaml
with open('${ATOM_WS_PATH}/models_atom.yaml') as f:
m = yaml.safe_load(f).get('${MODEL_NAME}', {})
print(f'MODEL_ENVS=\"{m.get(\"env\", \"\")}\"')
print(f'MODEL_TP_DP_FLAGS=\"{m.get(\"tp_dp_flags\", \"\")}\"')
print(f'MODEL_EP_DP_FLAGS=\"{m.get(\"ep_dp_flags\", \"\")}\"')
print(f'MODEL_TP_DP_ENV=\"{m.get(\"tp_dp_env\", \"\")}\"')
print(f'MODEL_EP_DP_ENV=\"{m.get(\"ep_dp_env\", \"\")}\"')
print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"')
print(f'MODEL_KV_ARG=\"{m.get(\"kv_cache_flags\", \"\")}\"')
Comment on lines +80 to +94

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 hf_overrides JSON quotes stripped by bash eval — breaks DeepSeek-V4-Pro startup. The Python f-string print(f'_HF_OVERRIDES="{m.get("hf_overrides", "")}"') interpolates the raw YAML value ({"use_index_cache":true,"index_topk_freq":4}) into an outer double-quoted bash assignment, so the inner " characters terminate the outer string. After eval, _HF_OVERRIDES holds {use_index_cache:true,index_topk_freq:4} — unquoted keys — and --hf-overrides '{use_index_cache:true,index_topk_freq:4}' is rejected as invalid JSON, blocking the documented Verify server_atom.sh launches correctly for DeepSeek-V4-Pro test plan item. Fix: emit the value with shlex.quote() (or json.dumps) in the Python so the JSON survives the bash eval intact.

Extended reasoning...

What the bug is

The new YAML-driven loader at server_atom.sh:80-94 runs a Python snippet whose output is captured and eval'd as bash. The hf_overrides field for DeepSeek-V4-Pro in models_atom.yaml is the JSON string {"use_index_cache":true,"index_topk_freq":4}. The emitter line is:

print(f'_HF_OVERRIDES="{m.get("hf_overrides", "")}"')

For DeepSeek-V4-Pro this prints literally:

_HF_OVERRIDES="{"use_index_cache":true,"index_topk_freq":4}"

The inner unescaped " characters terminate the outer double-quoted region. Bash treats the line as a concatenation of quoted and unquoted word segments — "{" + use_index_cache + ":true," + index_topk_freq + ":4}" — and assembles them into a single word with all the quotes stripped.

Step-by-step proof

Reproduced locally with the exact Python emitter and bash eval:

$ python3 -c 'm={"hf_overrides": "{\"use_index_cache\":true,\"index_topk_freq\":4}"}; print(f"_HF_OVERRIDES=\"{m.get(\"hf_overrides\", \"\")}\"")'
_HF_OVERRIDES="{"use_index_cache":true,"index_topk_freq":4}"

$ eval '_HF_OVERRIDES="{"use_index_cache":true,"index_topk_freq":4}"' && echo "[$_HF_OVERRIDES]"
[{use_index_cache:true,index_topk_freq:4}]

$ python3 -c 'import json; json.loads("{use_index_cache:true,index_topk_freq:4}")'
json.decoder.JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

After the eval, the downstream line

HF_OVERRIDES_ARG="--hf-overrides '${_HF_OVERRIDES}'"

produces --hf-overrides '{use_index_cache:true,index_topk_freq:4}' — an invalid JSON literal with unquoted keys. The atom server's argparse / json.loads on --hf-overrides will reject this at startup.

Why existing code doesn't prevent it

The pre-PR code hard-coded the value as a bash string with backslash-escaped inner quotes:
HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'"
That escaping is exactly what survives bash parsing, and it is what the new YAML-driven path loses. DeepSeek-V4-Pro is the only model in models_atom.yaml with a non-empty hf_overrides (the other models' YAML fields contain no " characters, so they are unaffected); the other emitted assignments (env, tp_dp_flags, etc.) are safe.

Impact

This regresses the dsv4-fp4-mi355x-atom-disagg recipe in amd-master.yaml (which sets MODEL_NAME=DeepSeek-V4-Pro and routes through server_atom.sh). The server will fail at startup when atom's argparse calls json.loads on the --hf-overrides argument — and this is precisely the path the PR's own test plan flags (Verify server_atom.sh launches correctly for DeepSeek-V4-Pro).

Fix

Quote the value in the Python emitter so the bash eval sees a properly-escaped literal. Either:

import shlex
print(f'_HF_OVERRIDES={shlex.quote(m.get("hf_overrides", ""))}')

(produces _HF_OVERRIDES='{"use_index_cache":true,"index_topk_freq":4}', which bash parses correctly), or write each value to a NUL-delimited side channel that bash reads with read -d '' instead of evaling arbitrary Python output.

print(f'_HF_OVERRIDES=\"{m.get(\"hf_overrides\", \"\")}\"')
")"

# =============================================================================
# Cluster Topology Configuration
# =============================================================================
Expand Down Expand Up @@ -114,53 +131,48 @@ DECODE_ENABLE_DP="${DECODE_ENABLE_DP}"
# Parallel args
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
if [ "$PREFILL_ENABLE_DP" = "true" ]; then
if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
else #TP+DPA+TBO
if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo )
export GPU_MAX_HW_QUEUES=5
export ATOM_CPU_AFFINITY=1
else #TP+DPA
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention )
fi
if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #EP+DPA
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_EP_DP_FLAGS})
for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done
else #TP+DPA
PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_TP_DP_FLAGS})
for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done
fi
fi
fi

# (srok), split DPA & TBO cases
DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP
if [ "$DECODE_ENABLE_DP" = "true" ]; then
if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention )
else #TP+DPA+TBO
if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo )
export GPU_MAX_HW_QUEUES=5
export ATOM_CPU_AFFINITY=1
else #TP+DPA
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention )
fi
if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #EP+DPA
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_EP_DP_FLAGS})
for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done
else #TP+DPA
DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_TP_DP_FLAGS})
for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done
fi
fi

# MTP args
SPEC_ARGS=() #TP
if [ "$SPEC_DECODING" = "mtp" ]; then
SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE")
fi
unset _dp_env_pair

# HF overrides (single-quoted JSON preserved through eval)
HF_OVERRIDES_ARG=""
if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then
HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'"
if [[ -n "$_HF_OVERRIDES" ]]; then
HF_OVERRIDES_ARG="--hf-overrides '${_HF_OVERRIDES}'"
fi
unset _HF_OVERRIDES

for _env_pair in ${MODEL_ENVS}; do
export "$_env_pair"
done
unset _env_pair

# KV cache dtype (skip if unset or 'auto')
KV_CACHE_ARG=""
if [[ -n "$KV_CACHE_DTYPE" && "$KV_CACHE_DTYPE" != "auto" ]]; then
KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}"
# MTP args
SPEC_ARGS=()
if [ "$SPEC_DECODING" = "mtp" ]; then
SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE")
fi

# KV cache arg - full flag string from YAML
KV_CACHE_ARG="${MODEL_KV_ARG}"

# Optional model length / batched-token cap
MODEL_LEN_ARGS=""
if [[ -n "$MAX_MODEL_LEN" ]]; then
Expand All @@ -170,9 +182,6 @@ if [[ -n "$MAX_NUM_BATCHED_TOKENS" ]]; then
MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}"
fi

if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then
export AITER_QUICK_REDUCE_QUANTIZATION=INT4
fi

cat <<INFO
=== Configuration ===
Expand All @@ -183,7 +192,7 @@ MODEL : ${MODEL_NAME}
BACKEND : atom (PD mooncake KV transfer)
MTP : method=mtp num_speculative_tokens=${DECODE_MTP_SIZE}
xP/yD : ${xP} / ${yD}
KV cache : dtype=${KV_CACHE_DTYPE:-auto} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC}
KV cache : ${KV_CACHE_ARG:-none} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC}
Comment on lines 193 to +195

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Nit: line 193's INFO banner prints the literal string MTP : method=mtp num_speculative_tokens=${DECODE_MTP_SIZE}, but mtp_flags is now YAML-driven and MiniMax-M3-MXFP4/MXFP8 use --method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3. When those models run with SPEC_DECODING=mtp, the banner will misleadingly claim method=mtp. Pure log/cosmetic — the Spec args : ${SPEC_ARGS[*]} line immediately below prints the actual flags. Suggest dropping the hardcoded method=mtp (the Spec args line already covers it) or replacing with ${MODEL_MTP_FLAGS}.

Extended reasoning...

### What

In benchmarks/multi_node/amd_utils/server_atom.sh the === Configuration === heredoc still contains a hardcoded line:

MTP : method=mtp num_speculative_tokens=${DECODE_MTP_SIZE}

This line predates the YAML-driven refactor in this PR. With the new mtp_flags field, MODEL_MTP_FLAGS can be anything — for the two new MiniMax-M3 entries it is --method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens. So the banner can advertise method=mtp even when the server is actually being launched with EAGLE3 flags.

### Step-by-step proof

1. Set MODEL_NAME=MiniMax-M3-MXFP8, SPEC_DECODING=mtp, DECODE_MTP_SIZE=2.
2. The YAML block (models_atom.yaml:34-39) supplies mtp_flags: --method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens, so MODEL_MTP_FLAGS is the EAGLE3 string.
3. server_atom.sh:174-176 builds SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE") → the actual server is launched with --method eagle3 --draft-model … --num-speculative-tokens 2.
4. server_atom.sh:193 still prints MTP : method=mtp num_speculative_tokens=2 — factually wrong about method.
5. server_atom.sh:199 immediately below prints Spec args : --method eagle3 --draft-model Inferact/MiniMax-M3-EAGLE3 --num-speculative-tokens 2, which is correct.

### Why this doesn't break anything

The SPEC_ARGS array (the actual flags passed to python3 -m atom.entrypoints.openai_server) is built correctly from MODEL_MTP_FLAGS; the bug is exclusively in the banner. The Spec args line two lines below shows the truth, so an operator inspecting the log can see the real flags.
Also note: the new minimaxm3-fp8-mi355x-atom-disagg recipe sets SPEC_DECODING=none and DECODE_MTP_SIZE=0, so the misleading line is not exercised by anything this PR enables today — it's a latent issue for whenever a MiniMax-M3 atom-disagg recipe is added with MTP on.

### How to fix

Replace line 193 with one of:

MTP : ${MODEL_MTP_FLAGS} ${DECODE_MTP_SIZE}

or simply drop the line — Spec args : ${SPEC_ARGS[*]} directly below already covers the same information accurately. Severity is nit because it is purely cosmetic and the correct values are visible one line lower.

Model len: max_model_len=${MAX_MODEL_LEN:-unset} max_num_batched_tokens=${MAX_NUM_BATCHED_TOKENS:-unset}
Prefill args : ${PREFILL_PARALLEL_ARGS[*]}
Decode args : ${DECODE_PARALLEL_ARGS[*]}
Expand Down
1 change: 0 additions & 1 deletion benchmarks/multi_node/minimaxm3_fp8_mi355x_atom-disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ export SPEC_DECODING="none"
export DECODE_MTP_SIZE=0

# Block size 128
export KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-auto}"
export BLOCK_SIZE="${BLOCK_SIZE:-128}"
export MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.8}"
export MAX_MODEL_LEN=32768
Expand Down
8 changes: 8 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4183,3 +4183,11 @@
- "server_atom.sh: fix _MAX_CONC assignment before cudagraph size check; gate ATOM_MOE_GU_ITLV/AITER_BF16_FP8_MOE_BOUND on DeepSeek-V4-Pro only"
- "Search space: ISL=8192 and ISL=1024, 1P1D TP4, conc 1-512"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1927

- config-keys:
- minimaxm3-fp8-mi355x-atom-disagg
description:
- "Refactor server_atom.sh: eliminate all hardcoded model-name checks; drive all model-specific config (env vars, parallel flags, MTP flags, KV cache flags, HF overrides) from models_atom.yaml"
- "models_atom.yaml: add MiniMax-M3-MXFP4 and MiniMax-M3-MXFP8 entries with EAGLE3 MTP flags; add DeepSeek-V4-Pro with TBO/cpu-affinity TP+DPA env and MTP flags; add tp_dp_flags, ep_dp_flags, tp_dp_env, ep_dp_env, kv_cache_flags, mtp_flags, hf_overrides fields"
- "Image bump for minimaxm3-fp8-mi355x-atom-disagg: rocm/atom-dev:MiniMax-M3-20260622 -> rocm/atom-dev:MiniMax-M3-20260623"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1930