SemiAnalysisAI · functionstackx · Jun 24, 2026 · Jun 14, 2026 · Jun 15, 2026 · Jun 15, 2026
@@ -2748,3 +2748,170 @@ minimaxm3-fp8-mi325x-vllm-mtp:
       - { tp: 8, conc-start: 1, conc-end: 128, spec-decoding: mtp }
       - { tp: 8, ep: 8, conc-start: 256, conc-end: 256, spec-decoding: mtp }
       - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256, spec-decoding: mtp }
+
+# MiniMax-M3 MXFP8 MI355X vLLM disaggregated (prefill/decode) smoke test on the
+# day-zero ROCm image. Minimal 1 prefill (TP8) + 1 decode (TP8) at conc 1 to
+# validate the MoRI-IO KV-transfer disagg pipeline end-to-end for M3. Layered on
+# the MoRI-patch-removal infra (#1585). No EP (TP8 only); MoE experts are
+# TP-sharded as in the single-node M3 TP8 recipe. Per-worker serve flags live in
+# benchmarks/multi_node/amd_utils/models_vllm.yaml (MiniMax-M3-MXFP8).
+minimaxm3-fp8-mi355x-vllm-disagg:
+  image: vllm/vllm-openai-rocm:nightly-556bc4e3a089378e9df2482659898192da18db15
+  model: MiniMaxAI/MiniMax-M3-MXFP8
+  model-prefix: minimaxm3
+  runner: mi355x-disagg
+  precision: fp8
+  framework: vllm-disagg
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across
+      # conc 1,2,4,8,16,32,64,128,256.
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024.
+      - spec-decoding: "none"
+        conc-list: [ 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2)
+      # feeding one full-node TP8 decode, at high conc 256,512,768,1024.
+      - spec-decoding: "none"
+        conc-list: [ 256, 512, 768, 1024 ]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+    # 8k1k disagg sweep across four P/D layouts (1P TP8 + 1D TP8 conc 1..1024;
+    # 1P TP4 + 1D TP8 conc 1..256; 1P TP4 + 1D TP4 conc 64..1024; 2P TP4 + 1D TP8
+    # conc 256..1024). The multi-node eval policy (8k1k + conc >= 16) marks one
+    # lm-eval on the highest-max-conc layout (TP8+TP8, eval-conc=median=128) —
+    # validating the M3 MoRI-IO disagg pipeline's correctness end-to-end.
+    - isl: 8192
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # Asymmetric 1P TP4 + 1D TP8 (smaller prefill, full-node decode) across
+      # conc 1,2,4,8,16,32,64,128,256.
+      - spec-decoding: "none"
+        conc-list: [ 1, 2, 4, 8, 16, 32, 64, 128, 256 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # Balanced half-node 1P TP4 + 1D TP4 at high conc 64,128,256,512,1024.
+      - spec-decoding: "none"
+        conc-list: [ 64, 128, 256, 512, 1024 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
+      # 2P TP4 + 1D TP8: two half-node TP4 prefill workers (PREFILL_NODES=2)
+      # feeding one full-node TP8 decode, at high conc 256,512,768,1024.
+      - spec-decoding: "none"
+        conc-list: [ 256, 512, 768, 1024 ]
+        prefill:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "DECODE_NODES=1"
diff --git a/benchmarks/multi_node/amd_utils/job.slurm b/benchmarks/multi_node/amd_utils/job.slurm
@@ -80,7 +80,6 @@ if [[ "${MORI_CONN_PATCH:-auto}" != "skip" ]] \
     export EXTRA_DOCKER_MOUNTS
     echo "[job.slurm] auto-applied MoRI conn.py overlay: ${_MORI_PATCH_FILE}"
 fi
-
 xP="${xP:-1}"
 yD="${yD:-1}"
 
@@ -315,8 +314,10 @@ export IS_MULTINODE="${IS_MULTINODE:-false}"
 SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
 export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
 
-# vLLM external router container
-VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260511-e667ebb}"
+# vLLM external router container.
+# NOTE: vllm/vllm-router only retains ~16 recent nightlies on Docker Hub; older
+# dated tags are garbage-collected (manifest unknown)
+VLLM_ROUTER_IMAGE="${VLLM_ROUTER_IMAGE:-vllm/vllm-router:nightly-20260617-e667ebb}"
 ROUTER_CONT_NAME="router_vllm_${SANITIZED_USER}_${SLURM_JOB_ID}"
 export RUN_FILE_FULL="$WS_PATH/${RUN_FILE}"
 
@@ -401,7 +402,6 @@ if [[ "$ENGINE" == "vllm-disagg" ]]; then
         -e UCX_LOG_LEVEL=warn
         -e HSA_ENABLE_SDMA=1
         -e PROXY_STREAM_IDLE_TIMEOUT=\${PROXY_STREAM_IDLE_TIMEOUT:-300}
-        -e VLLM_MORIIO_CONNECTOR_READ_MODE=\${VLLM_MORIIO_CONNECTOR_READ_MODE:-1}
         -e PYTHONPYCACHEPREFIX=/tmp/pycache
     )
 elif [[ "$ENGINE" == "atom-disagg" ]]; then

diff --git a/benchmarks/multi_node/amd_utils/models_vllm.yaml b/benchmarks/multi_node/amd_utils/models_vllm.yaml
@@ -26,19 +26,30 @@ amd-Llama-3.3-70B-Instruct-FP8-KV:
 
 Kimi-K2.5-MXFP4:
   prefill_flags: "--tensor-parallel-size 8 --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
-  decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
+  decode_flags: "--tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --compilation-config '{\"cudagraph_mode\":\"PIECEWISE\"}' --no-enable-prefix-caching --block-size 1 --gpu-memory-utilization 0.90 --mm-encoder-tp-mode data"
   env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_PAGED_ATTN=0 VLLM_ROCM_USE_AITER_RMSNORM=1 VLLM_USE_AITER_TRITON_SILU_MUL=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
   hf_dir: "models--amd--Kimi-K2.5-MXFP4"
 
 MiniMax-M2.5:
   # AITER fused-MoE kernel fmoe_bf16_blockscaleFp8_g1u1_vs_silu_32x384 for gfx950 writes OOB when run with MiniMax's shapes at M=8K(=num batched tokens), crashing vllm during AITER warmup.
   # Set token budget to 4k to avoid using that shape, instead of disabling AITER_MOE.
-  prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
-  decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
+  prefill_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
+  decode_flags: "--max-num-batched-tokens 4K --tensor-parallel-size 8 --enable-expert-parallel --all2all-backend mori_low_latency --no-enable-prefix-caching --gpu-memory-utilization 0.95 --block-size 32"
   env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 VLLM_ENGINE_READY_TIMEOUT_S=3600 VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1"
   hf_dir: "models--MiniMaxAI--MiniMax-M2.5"
 
 gpt-oss-120b:
   prefill_flags: "--tensor-parallel-size 8"
   decode_flags: "--tensor-parallel-size 8"
   env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 VLLM_USE_AITER_UNIFIED_ATTENTION=1 VLLM_ROCM_USE_AITER_MHA=0 ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0"
+
+MiniMax-M3-MXFP8:
+  # MiniMax-M3 MXFP8 disagg, no EP. The --tensor-parallel-size 8 below is just a
+  # placeholder: server_vllm.sh sed-rewrites it to PREFILL_TP_SIZE/DECODE_TP_SIZE
+  # from the master-config prefill/decode tp (the sweep mixes TP8 and TP4 layouts).
+  # --block-size 128 is mandatory (MSA sparse/index cache); text-only benchmark
+  # so --language-model-only frees the vision encoder. gfx950 uses FP8 KV cache.
+  prefill_flags: "--tensor-parallel-size 8 --block-size 128 --language-model-only --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --no-enable-prefix-caching --gpu-memory-utilization 0.90 --tool-call-parser minimax_m3 --reasoning-parser minimax_m3 --enable-auto-tool-choice"
+  decode_flags: "--tensor-parallel-size 8 --block-size 128 --language-model-only --kv-cache-dtype fp8 --attention-backend TRITON_ATTN --no-enable-prefix-caching --gpu-memory-utilization 0.90 --tool-call-parser minimax_m3 --reasoning-parser minimax_m3 --enable-auto-tool-choice"
+  env: "VLLM_USE_V1=1 VLLM_ROCM_USE_AITER=1 VLLM_USE_BREAKABLE_CUDAGRAPH=0 VLLM_ENGINE_READY_TIMEOUT_S=3600"
+  hf_dir: "models--MiniMaxAI--MiniMax-M3-MXFP8"
diff --git a/benchmarks/multi_node/amd_utils/patches/README.md b/benchmarks/multi_node/amd_utils/patches/README.md
@@ -1,16 +1,23 @@
-# In-tree sglang patches for the MoRI PD-disagg path
-
-This directory carries small Python overlays that get bind-mounted over
-the upstream sglang source inside the docker container at runtime.
-They are needed because some sglang releases ship known bugs in the
-MoRI disaggregation backend that block our benchmark + accuracy
-configs.
-
-The mount is wired through the `EXTRA_DOCKER_MOUNTS` env var that
-`job.slurm` consumes (an opt-in `${EXTRA_DOCKER_MOUNTS:-}` after the
-existing `-v` block). The local-test driver scripts under
-`scripts/sglang_disagg/` pre-set this env var to the path of the
-relevant overlay; CI runners that need the patch can do the same.
+# In-tree patches for the MoRI / MoRIIO PD-disagg path
+
+This directory carries small overlays that fix up the engine source inside
+the docker container at runtime. They are needed because some published
+images ship known bugs in the (MoRI / MoRIIO) disaggregation backend that
+block our benchmark + accuracy configs — so we can keep reusing the
+**stock image** instead of rebuilding a patched one.
+
+- `mori_conn.py` — single-file overlay (bind-mounted) for the **sglang**
+  MoRI backend.
+
+> Note: the vLLM MoRIIO `minimax-m3` overlay (`moriio/`) was retired once the
+> upstream fixes (vLLM #46039 / #46290 / #46332) shipped in the ROCm nightly
+> image; `minimaxm3-fp8-mi355x-vllm-disagg` now runs the stock nightly directly.
+
+The `mori_conn.py` overlay is wired through the `EXTRA_DOCKER_MOUNTS` env
+var that `job.slurm` consumes (an opt-in `${EXTRA_DOCKER_MOUNTS:-}` after
+the existing `-v` block). The local-test driver scripts under
+`scripts/sglang_disagg/` pre-set this env var to the path of the relevant
+overlay; CI runners that need the patch can do the same.
 
 ## `mori_conn.py`
 

diff --git a/benchmarks/multi_node/amd_utils/server_vllm.sh b/benchmarks/multi_node/amd_utils/server_vllm.sh
@@ -256,7 +256,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         --served-model-name ${SERVED_MODEL} \
         --port $SERVER_PORT \
         --trust-remote-code \
-        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
         ${PREFILL_SERVER_CONFIG}"
 
     if [[ "$DRY_RUN" -eq 1 ]]; then
@@ -422,7 +422,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$xP" ]; then
         --served-model-name ${SERVED_MODEL} \
         --port $SERVER_PORT \
         --trust-remote-code \
-        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_producer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
         ${PREFILL_SERVER_CONFIG}"
 
     if [[ "$DRY_RUN" -eq 1 ]]; then
@@ -478,7 +478,7 @@ else
         --served-model-name ${SERVED_MODEL} \
         --port $SERVER_PORT \
         --trust-remote-code \
-        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\"}}' \
+        --kv-transfer-config '{\"kv_connector\": \"MoRIIOConnector\", \"kv_role\": \"kv_consumer\", \"kv_connector_extra_config\": {\"proxy_ip\": \"${NODE0_ADDR}\", \"proxy_ping_port\": \"${PROXY_PING_PORT}\", \"http_port\": \"${SERVER_PORT}\", \"read_mode\": true}}' \
         ${DECODE_SERVER_CONFIG}"
 
     if [[ "$DRY_RUN" -eq 1 ]]; then