Skip to content

Commit 3d714a9

Browse files
authored
Merge branch 'main' into amd/m3_atom_pd_fp8_0623
2 parents 872f3ff + 9d59b9e commit 3d714a9

7 files changed

Lines changed: 29 additions & 22 deletions

File tree

.github/configs/nvidia-master.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9014,7 +9014,7 @@ kimik2.5-fp4-gb200-dynamo-vllm:
90149014
dp-attn: true
90159015

90169016
dsv4-fp4-b200-dynamo-vllm:
9017-
image: vllm/vllm-openai:v0.20.1
9017+
image: vllm/vllm-openai:v0.23.0
90189018
model: deepseek-ai/DeepSeek-V4-Pro
90199019
model-prefix: dsv4
90209020
runner: b200-multinode

benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-high-tpt-megamoe.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-high-tpt-megamoe"
1717
# absorb cold-cache model loads.
1818
model:
1919
path: "deepseek-v4-pro"
20-
container: "vllm/vllm-openai:v0.20.1"
20+
container: "vllm/vllm-openai:v0.23.0"
2121
precision: "fp4"
2222

2323
dynamo:
@@ -83,13 +83,13 @@ backend:
8383
enforce-eager: true
8484
max-model-len: 9280
8585
max-num-seqs: 16
86-
max-num-batched-tokens: 32768
86+
max-num-batched-tokens: 16384
8787
trust-remote-code: true
8888
no-enable-prefix-caching: true
8989
no-enable-flashinfer-autotune: true
9090
no-async-scheduling: true
9191
block-size: 256
92-
gpu-memory-utilization: 0.95
92+
gpu-memory-utilization: 0.9
9393
no-disable-hybrid-kv-cache-manager: true
9494
enable-sleep-mode: true
9595
numa-bind: true
@@ -132,7 +132,7 @@ identity:
132132
repo: "deepseek-ai/DeepSeek-V4-Pro"
133133
revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
134134
container:
135-
image: "vllm/vllm-openai:v0.20.1"
135+
image: "vllm/vllm-openai:v0.23.0"
136136
frameworks:
137137
dynamo: "1.2.0.dev20260426"
138-
vllm: "0.20.0"
138+
vllm: "0.23.0"

benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-latency.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-low-latency"
1717
# absorb cold-cache model loads.
1818
model:
1919
path: "deepseek-v4-pro"
20-
container: "vllm/vllm-openai:v0.20.1"
20+
container: "vllm/vllm-openai:v0.23.0"
2121
precision: "fp4"
2222

2323
dynamo:
@@ -131,7 +131,7 @@ benchmark:
131131

132132
identity:
133133
container:
134-
image: "vllm/vllm-openai:v0.20.1"
134+
image: "vllm/vllm-openai:v0.23.0"
135135
frameworks:
136136
dynamo: "1.2.0.dev20260426"
137-
vllm: "0.20.0"
137+
vllm: "0.23.0"

benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-low-middle-curve.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-low-middle-curve"
1717
# absorb cold-cache model loads.
1818
model:
1919
path: "deepseek-v4-pro"
20-
container: "vllm/vllm-openai:v0.20.1"
20+
container: "vllm/vllm-openai:v0.23.0"
2121
precision: "fp4"
2222

2323
dynamo:
@@ -132,7 +132,7 @@ benchmark:
132132

133133
identity:
134134
container:
135-
image: "vllm/vllm-openai:v0.20.1"
135+
image: "vllm/vllm-openai:v0.23.0"
136136
frameworks:
137137
dynamo: "1.2.0.dev20260426"
138-
vllm: "0.20.0"
138+
vllm: "0.23.0"

benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-max-tpt-megamoe.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-max-tpt-megamoe"
1717
# absorb cold-cache model loads.
1818
model:
1919
path: "deepseek-v4-pro"
20-
container: "vllm/vllm-openai:v0.20.1"
20+
container: "vllm/vllm-openai:v0.23.0"
2121
precision: "fp4"
2222

2323
dynamo:
@@ -83,13 +83,13 @@ backend:
8383
enforce-eager: true
8484
max-model-len: 9280
8585
max-num-seqs: 16
86-
max-num-batched-tokens: 32768
86+
max-num-batched-tokens: 16384
8787
trust-remote-code: true
8888
no-enable-prefix-caching: true
8989
no-enable-flashinfer-autotune: true
9090
no-async-scheduling: true
9191
block-size: 256
92-
gpu-memory-utilization: 0.95
92+
gpu-memory-utilization: 0.9
9393
no-disable-hybrid-kv-cache-manager: true
9494
enable-sleep-mode: true
9595
numa-bind: true
@@ -132,7 +132,7 @@ identity:
132132
repo: "deepseek-ai/DeepSeek-V4-Pro"
133133
revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
134134
container:
135-
image: "vllm/vllm-openai:v0.20.1"
135+
image: "vllm/vllm-openai:v0.23.0"
136136
frameworks:
137137
dynamo: "1.2.0.dev20260426"
138-
vllm: "0.20.0"
138+
vllm: "0.23.0"

benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4/8k1k/disagg-b200-mid-curve-megamoe.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ name: "svf-vllm-disagg-b200-mid-curve-megamoe"
1717
# absorb cold-cache model loads.
1818
model:
1919
path: "deepseek-v4-pro"
20-
container: "vllm/vllm-openai:v0.20.1"
20+
container: "vllm/vllm-openai:v0.23.0"
2121
precision: "fp4"
2222

2323
dynamo:
@@ -83,13 +83,13 @@ backend:
8383
enforce-eager: true
8484
max-model-len: 9280
8585
max-num-seqs: 16
86-
max-num-batched-tokens: 32768
86+
max-num-batched-tokens: 16384
8787
trust-remote-code: true
8888
no-enable-prefix-caching: true
8989
no-enable-flashinfer-autotune: true
9090
no-async-scheduling: true
9191
block-size: 256
92-
gpu-memory-utilization: 0.95
92+
gpu-memory-utilization: 0.9
9393
no-disable-hybrid-kv-cache-manager: true
9494
enable-sleep-mode: true
9595
numa-bind: true
@@ -132,7 +132,7 @@ identity:
132132
repo: "deepseek-ai/DeepSeek-V4-Pro"
133133
revision: "0366e4e064385807ea86b088a5c6c878ff23343b"
134134
container:
135-
image: "vllm/vllm-openai:v0.20.1"
135+
image: "vllm/vllm-openai:v0.23.0"
136136
frameworks:
137137
dynamo: "1.2.0.dev20260426"
138-
vllm: "0.20.0"
138+
vllm: "0.23.0"

perf-changelog.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4191,3 +4191,10 @@
41914191
- "models_atom.yaml: add MiniMax-M3-MXFP4 and MiniMax-M3-MXFP8 entries with EAGLE3 MTP flags; add DeepSeek-V4-Pro with TBO/cpu-affinity TP+DPA env and MTP flags; add tp_dp_flags, ep_dp_flags, tp_dp_env, ep_dp_env, kv_cache_flags, mtp_flags, hf_overrides fields"
41924192
- "Image bump for minimaxm3-fp8-mi355x-atom-disagg: rocm/atom-dev:MiniMax-M3-20260622 -> rocm/atom-dev:MiniMax-M3-20260623"
41934193
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1930
4194+
4195+
- config-keys:
4196+
- dsv4-fp4-b200-dynamo-vllm
4197+
description:
4198+
- "Update the DeepSeek-V4-Pro B200 disaggregated Dynamo-vLLM benchmark to the vllm/vllm-openai:v0.23.0 image"
4199+
- "Lower max-num-batched-tokens to 16384 and gpu-memory-utilization to 0.9 on the high-throughput, max-throughput, and mid-curve MegaMoE recipes to avoid OOM"
4200+
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1899

0 commit comments

Comments
 (0)