-
Notifications
You must be signed in to change notification settings - Fork 208
[AMD] Add MiniMax-M3-FP4 MI355X ATOMESH update 0623 #1930
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 3 commits
a07ef93
ecda65b
872f3ff
3d714a9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -47,7 +47,6 @@ HANDSHAKE_PORT="${HANDSHAKE_PORT:-6301}" | |
|
|
||
| # ATOM server tuning (from reference script defaults) | ||
| MEM_FRAC_STATIC="${MEM_FRAC_STATIC:-0.85}" | ||
| KV_CACHE_DTYPE="${KV_CACHE_DTYPE:-fp8}" | ||
| BLOCK_SIZE="${BLOCK_SIZE:-16}" | ||
| MAX_NUM_SEQS="${MAX_NUM_SEQS:-256}" | ||
| MAX_MODEL_LEN="${MAX_MODEL_LEN:-}" | ||
|
|
@@ -78,6 +77,24 @@ if [[ -z "$host_ip" ]]; then | |
| fi | ||
| host_name=$(hostname) | ||
|
|
||
| # ============================================================================= | ||
| # Model-Specific Configuration from YAML | ||
| # ============================================================================= | ||
| # Load model-specific config from YAML (single parse for all fields) | ||
| eval "$(python3 -c " | ||
| import yaml | ||
| with open('${ATOM_WS_PATH}/models_atom.yaml') as f: | ||
| m = yaml.safe_load(f).get('${MODEL_NAME}', {}) | ||
| print(f'MODEL_ENVS=\"{m.get(\"env\", \"\")}\"') | ||
| print(f'MODEL_TP_DP_FLAGS=\"{m.get(\"tp_dp_flags\", \"\")}\"') | ||
| print(f'MODEL_EP_DP_FLAGS=\"{m.get(\"ep_dp_flags\", \"\")}\"') | ||
| print(f'MODEL_TP_DP_ENV=\"{m.get(\"tp_dp_env\", \"\")}\"') | ||
| print(f'MODEL_EP_DP_ENV=\"{m.get(\"ep_dp_env\", \"\")}\"') | ||
| print(f'MODEL_MTP_FLAGS=\"{m.get(\"mtp_flags\", \"\")}\"') | ||
| print(f'MODEL_KV_ARG=\"{m.get(\"kv_cache_flags\", \"\")}\"') | ||
|
Comment on lines
+80
to
+94
Contributor
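There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

🔴 hf_overrides JSON quotes stripped by bash

Extended reasoning...

What the bug is: The new YAML-driven loader (lines +80 to +94) emits the value with `print(f'_HF_OVERRIDES="{m.get("hf_overrides", "")}"')`. For DeepSeek-V4-Pro this prints literally: `_HF_OVERRIDES="{"use_index_cache":true,"index_topk_freq":4}"`. The inner unescaped `"` characters close and reopen the outer double-quoted string, so bash's `eval` removes them and stores `{use_index_cache:true,index_topk_freq:4}`, which is not valid JSON.

Step-by-step proof: Reproduced locally with the exact Python emitter and bash `eval`. After the `eval`, `HF_OVERRIDES_ARG="--hf-overrides '${_HF_OVERRIDES}'"` produces `--hf-overrides '{use_index_cache:true,index_topk_freq:4}'`.

Why existing code doesn't prevent it: The pre-PR code hard-coded the value as a bash string with backslash-escaped inner quotes: `HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'"`.

Impact: This regresses the `--hf-overrides` argument for DeepSeek-V4-Pro.

Fix: Quote the value in the Python emitter so the bash `eval` preserves the inner double quotes:
import shlex
print(f'_HF_OVERRIDES={shlex.quote(m.get("hf_overrides", ""))}')
(produces `_HF_OVERRIDES='{"use_index_cache":true,"index_topk_freq":4}'`) |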
||
| print(f'_HF_OVERRIDES=\"{m.get(\"hf_overrides\", \"\")}\"') | ||
| ")" | ||
|
|
||
| # ============================================================================= | ||
| # Cluster Topology Configuration | ||
| # ============================================================================= | ||
|
|
@@ -114,53 +131,48 @@ DECODE_ENABLE_DP="${DECODE_ENABLE_DP}" | |
| # Parallel args | ||
| PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP | ||
| if [ "$PREFILL_ENABLE_DP" = "true" ]; then | ||
| if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #DPA+EP | ||
| PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) | ||
| else #TP+DPA+TBO | ||
| if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then | ||
| PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention --enable-tbo ) | ||
| export GPU_MAX_HW_QUEUES=5 | ||
| export ATOM_CPU_AFFINITY=1 | ||
| else #TP+DPA | ||
| PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" --enable-dp-attention ) | ||
| fi | ||
| if [ "$PREFILL_ENABLE_EP" -gt 1 ]; then #EP+DPA | ||
| PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_EP_DP_FLAGS}) | ||
| for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done | ||
| else #TP+DPA | ||
| PREFILL_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE" ${MODEL_TP_DP_FLAGS}) | ||
| for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done | ||
| fi | ||
| fi | ||
| fi | ||
|
|
||
| # (srok), split DPA & TBO cases | ||
| DECODE_PARALLEL_ARGS=(-tp "$PREFILL_TP_SIZE") #TP | ||
| DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE") #TP | ||
| if [ "$DECODE_ENABLE_DP" = "true" ]; then | ||
| if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #DPA+EP | ||
| DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-expert-parallel --enable-dp-attention ) | ||
| else #TP+DPA+TBO | ||
| if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then | ||
| DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention --enable-tbo ) | ||
| export GPU_MAX_HW_QUEUES=5 | ||
| export ATOM_CPU_AFFINITY=1 | ||
| else #TP+DPA | ||
| DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" --enable-dp-attention ) | ||
| fi | ||
| if [ "$DECODE_ENABLE_EP" -gt 1 ]; then #EP+DPA | ||
| DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_EP_DP_FLAGS}) | ||
| for _dp_env_pair in ${MODEL_EP_DP_ENV}; do export "$_dp_env_pair"; done | ||
| else #TP+DPA | ||
| DECODE_PARALLEL_ARGS=(-tp "$DECODE_TP_SIZE" ${MODEL_TP_DP_FLAGS}) | ||
| for _dp_env_pair in ${MODEL_TP_DP_ENV}; do export "$_dp_env_pair"; done | ||
| fi | ||
| fi | ||
|
|
||
| # MTP args | ||
| SPEC_ARGS=() #TP | ||
| if [ "$SPEC_DECODING" = "mtp" ]; then | ||
| SPEC_ARGS=(--method mtp --num-speculative-tokens "$DECODE_MTP_SIZE") | ||
| fi | ||
| unset _dp_env_pair | ||
|
|
||
| # HF overrides (single-quoted JSON preserved through eval) | ||
| HF_OVERRIDES_ARG="" | ||
| if [[ "$MODEL_NAME" == "DeepSeek-V4-Pro" ]]; then | ||
| HF_OVERRIDES_ARG="--hf-overrides '{\"use_index_cache\":true,\"index_topk_freq\":4}'" | ||
| if [[ -n "$_HF_OVERRIDES" ]]; then | ||
| HF_OVERRIDES_ARG="--hf-overrides '${_HF_OVERRIDES}'" | ||
| fi | ||
| unset _HF_OVERRIDES | ||
|
|
||
| for _env_pair in ${MODEL_ENVS}; do | ||
| export "$_env_pair" | ||
| done | ||
| unset _env_pair | ||
|
|
||
| # KV cache dtype (skip if unset or 'auto') | ||
| KV_CACHE_ARG="" | ||
| if [[ -n "$KV_CACHE_DTYPE" && "$KV_CACHE_DTYPE" != "auto" ]]; then | ||
| KV_CACHE_ARG="--kv_cache_dtype ${KV_CACHE_DTYPE}" | ||
| # MTP args | ||
| SPEC_ARGS=() | ||
| if [ "$SPEC_DECODING" = "mtp" ]; then | ||
| SPEC_ARGS=(${MODEL_MTP_FLAGS} "$DECODE_MTP_SIZE") | ||
| fi | ||
|
|
||
| # KV cache arg - full flag string from YAML | ||
| KV_CACHE_ARG="${MODEL_KV_ARG}" | ||
|
|
||
| # Optional model length / batched-token cap | ||
| MODEL_LEN_ARGS="" | ||
| if [[ -n "$MAX_MODEL_LEN" ]]; then | ||
|
|
@@ -170,9 +182,6 @@ if [[ -n "$MAX_NUM_BATCHED_TOKENS" ]]; then | |
| MODEL_LEN_ARGS="${MODEL_LEN_ARGS} --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}" | ||
| fi | ||
|
|
||
| if [[ "$MODEL_NAME" != "DeepSeek-V4-Pro" ]]; then | ||
| export AITER_QUICK_REDUCE_QUANTIZATION=INT4 | ||
| fi | ||
|
|
||
| cat <<INFO | ||
| === Configuration === | ||
|
|
@@ -183,7 +192,7 @@ MODEL : ${MODEL_NAME} | |
| BACKEND : atom (PD mooncake KV transfer) | ||
| MTP : method=mtp num_speculative_tokens=${DECODE_MTP_SIZE} | ||
| xP/yD : ${xP} / ${yD} | ||
| KV cache : dtype=${KV_CACHE_DTYPE:-auto} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC} | ||
| KV cache : ${KV_CACHE_ARG:-none} block_size=${BLOCK_SIZE} mem_frac=${MEM_FRAC_STATIC} | ||
|
Comment on lines
193
to
+195
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟡 Nit: line 193's INFO banner prints the literal string `method=mtp`. Extended reasoning... What: In … (the remainder of this comment is truncated in the page capture)
|
||
| Model len: max_model_len=${MAX_MODEL_LEN:-unset} max_num_batched_tokens=${MAX_NUM_BATCHED_TOKENS:-unset} | ||
| Prefill args : ${PREFILL_PARALLEL_ARGS[*]} | ||
| Decode args : ${DECODE_PARALLEL_ARGS[*]} | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.