Skip to content

Commit 9aa5ae2

Browse files
authored
Add DSv4-Pro FP4 GB200 SGLang disagg + MTP config (#1676)
* Add DSv4-Pro FP4 GB200 SGLang disagg + MTP config

  Initial submission of the MTP-decoded variant of the DSv4-Pro FP4 disagg GB200 SGLang config at 8k/1k. Eight prefill/decode topologies: two low-latency (1p1d-tp8-tp8, 1p6d-dep8-tp8) and six mid-curve points (1p1d-dep8-dep16 through 6p1d-dep8-dep16). Each scenario sets `spec-decoding: "mtp"` so the matrix turns on the MTP speculative-decode path; the chat template is enabled accordingly.

  Image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85

* Update perf-changelog pr-link for #1676

* Use NVIDIA/srt-slurm:main for DSv4 SGLang clone (drop submission-branch pin)

* Fix GB200 DSv4-Pro FP4 model path to shared Lustre root

  The dsv4/fp4 model path resolved to /mnt/numa1/models/deepseek-v4-pro/ (compute-node-local NVMe), which srtctl preflight reports as unavailable. Point both the dynamo-sglang and dynamo-vllm branches at the shared Lustre model root /mnt/lustre01/models/deepseek-v4-pro instead.

* Update perf-changelog.yaml with new configurations
1 parent 22ea914 commit 9aa5ae2

11 files changed

Lines changed: 1259 additions & 4 deletions

.github/configs/nvidia-master.yaml

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9484,6 +9484,143 @@ qwen3.5-fp8-gb200-dynamo-sglang:
94849484
ep: 16
94859485
dp-attn: true
94869486

9487+
9488+
# MTP variant of dsv4-fp4-gb200-dynamo-sglang.
9489+
dsv4-fp4-gb200-dynamo-sglang-mtp:
9490+
image: lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85
9491+
model: deepseek-ai/DeepSeek-V4-Pro
9492+
model-prefix: dsv4
9493+
runner: gb200
9494+
precision: fp4
9495+
framework: dynamo-sglang
9496+
multinode: true
9497+
disagg: true
9498+
scenarios:
9499+
fixed-seq-len:
9500+
- isl: 8192
9501+
osl: 1024
9502+
search-space:
9503+
# Low-latency baseline: 1p1d-tp8-tp8. 4 nodes.
9504+
- spec-decoding: "mtp"
9505+
conc-list: [1]
9506+
prefill:
9507+
num-worker: 1
9508+
tp: 8
9509+
ep: 1
9510+
dp-attn: false
9511+
additional-settings:
9512+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p1d-tp8-tp8-mtp.yaml"
9513+
decode:
9514+
num-worker: 1
9515+
tp: 8
9516+
ep: 1
9517+
dp-attn: false
9518+
# Low-latency 1p6d-dep8-tp8: 1 prefill worker (DEP=8) + 6 decode workers (TP=8 each). 14 nodes.
9519+
# The recipe sweeps concurrencies 32, 64 and 128 (concurrencies=32x64x128); this matrix entry tracks only the maximum (128).
9520+
- spec-decoding: "mtp"
9521+
conc-list: [128]
9522+
prefill:
9523+
num-worker: 1
9524+
tp: 8
9525+
ep: 8
9526+
dp-attn: true
9527+
additional-settings:
9528+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-low-latency-1p6d-dep8-tp8-mtp.yaml"
9529+
decode:
9530+
num-worker: 6
9531+
tp: 8
9532+
ep: 1
9533+
dp-attn: false
9534+
# Mid curve 1p1d-dep8-dep16. 6 nodes.
9535+
- spec-decoding: "mtp"
9536+
conc-list: [1024]
9537+
prefill:
9538+
num-worker: 1
9539+
tp: 8
9540+
ep: 8
9541+
dp-attn: true
9542+
additional-settings:
9543+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-1p1d-dep8-dep16-mtp.yaml"
9544+
decode:
9545+
num-worker: 1
9546+
tp: 16
9547+
ep: 16
9548+
dp-attn: true
9549+
# Mid curve 2p1d-dep8-dep16. 8 nodes.
9550+
- spec-decoding: "mtp"
9551+
conc-list: [2048]
9552+
prefill:
9553+
num-worker: 2
9554+
tp: 8
9555+
ep: 8
9556+
dp-attn: true
9557+
additional-settings:
9558+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-2p1d-dep8-dep16-mtp.yaml"
9559+
decode:
9560+
num-worker: 1
9561+
tp: 16
9562+
ep: 16
9563+
dp-attn: true
9564+
# Mid curve 3p1d-dep8-dep16. 10 nodes.
9565+
- spec-decoding: "mtp"
9566+
conc-list: [3072]
9567+
prefill:
9568+
num-worker: 3
9569+
tp: 8
9570+
ep: 8
9571+
dp-attn: true
9572+
additional-settings:
9573+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-3p1d-dep8-dep16-mtp.yaml"
9574+
decode:
9575+
num-worker: 1
9576+
tp: 16
9577+
ep: 16
9578+
dp-attn: true
9579+
# Mid curve 4p1d-dep8-dep16. 12 nodes.
9580+
- spec-decoding: "mtp"
9581+
conc-list: [6144]
9582+
prefill:
9583+
num-worker: 4
9584+
tp: 8
9585+
ep: 8
9586+
dp-attn: true
9587+
additional-settings:
9588+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-4p1d-dep8-dep16-mtp.yaml"
9589+
decode:
9590+
num-worker: 1
9591+
tp: 16
9592+
ep: 16
9593+
dp-attn: true
9594+
# Mid curve 5p1d-dep8-dep16. 14 nodes.
9595+
- spec-decoding: "mtp"
9596+
conc-list: [8192]
9597+
prefill:
9598+
num-worker: 5
9599+
tp: 8
9600+
ep: 8
9601+
dp-attn: true
9602+
additional-settings:
9603+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-5p1d-dep8-dep16-mtp.yaml"
9604+
decode:
9605+
num-worker: 1
9606+
tp: 16
9607+
ep: 16
9608+
dp-attn: true
9609+
# Mid curve 6p1d-dep8-dep16. 16 nodes.
9610+
- spec-decoding: "mtp"
9611+
conc-list: [16384]
9612+
prefill:
9613+
num-worker: 6
9614+
tp: 8
9615+
ep: 8
9616+
dp-attn: true
9617+
additional-settings:
9618+
- "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-gb200-mid-curve-6p1d-dep8-dep16-mtp.yaml"
9619+
decode:
9620+
num-worker: 1
9621+
tp: 16
9622+
ep: 16
9623+
dp-attn: true
94879624
dsv4-fp4-b300-dynamo-vllm:
94889625
image: vllm/vllm-openai:v0.20.1
94899626
model: deepseek-ai/DeepSeek-V4-Pro
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
name: "dsv4-pro-gb200-disagg-8k1k-low-latency-1p1d-tp8-tp8-mtp"
2+
3+
frontend:
4+
type: dynamo
5+
enable_multiple_frontends: true
6+
num_additional_frontends: 8
7+
8+
dynamo:
9+
hash: "92f5b3b8d7dd5ab9179d4b1034bd2c1c0803693e"
10+
install: true
11+
12+
model:
13+
path: "deepseek-v4-pro"
14+
container: "lmsysorg/sglang:nightly-dev-cu13-20260528-0abe6a85"
15+
precision: "fp4"
16+
17+
sbatch_directives:
18+
cpus-per-task: "144"
19+
mem: "0"
20+
21+
resources:
22+
gpu_type: "gb200"
23+
gpus_per_node: 4
24+
prefill_nodes: 2
25+
prefill_workers: 1
26+
gpus_per_prefill: 8
27+
decode_nodes: 2
28+
decode_workers: 1
29+
gpus_per_decode: 8
30+
31+
backend:
32+
type: sglang
33+
34+
prefill_environment:
35+
PYTHONUNBUFFERED: "1"
36+
SGLANG_RADIX_FORCE_MISS: "1"
37+
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
38+
SGLANG_DEFAULT_THINKING: "1"
39+
SGLANG_DSV4_REASONING_EFFORT: "max"
40+
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
41+
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
42+
NCCL_MNNVL_ENABLE: "1"
43+
NCCL_CUMEM_ENABLE: "1"
44+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
45+
MC_FORCE_MNNVL: "1"
46+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
47+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
48+
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
49+
50+
decode_environment:
51+
PYTHONUNBUFFERED: "1"
52+
SGLANG_RADIX_FORCE_MISS: "1"
53+
SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1"
54+
SGLANG_DEFAULT_THINKING: "1"
55+
SGLANG_DSV4_REASONING_EFFORT: "max"
56+
SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1"
57+
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1"
58+
NCCL_MNNVL_ENABLE: "1"
59+
NCCL_CUMEM_ENABLE: "1"
60+
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True"
61+
MC_FORCE_MNNVL: "1"
62+
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000"
63+
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000"
64+
SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1"
65+
66+
sglang_config:
67+
prefill:
68+
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
69+
model-path: "/model/"
70+
trust-remote-code: true
71+
tool-call-parser: deepseekv4
72+
73+
disaggregation-mode: "prefill"
74+
disaggregation-transfer-backend: mooncake
75+
76+
tensor-parallel-size: 8
77+
data-parallel-size: 1
78+
expert-parallel-size: 1
79+
80+
moe-runner-backend: "flashinfer_mxfp4"
81+
disable-flashinfer-autotune: true
82+
83+
mem-fraction-static: 0.9
84+
max-running-requests: 16
85+
cuda-graph-max-bs: 8
86+
chunked-prefill-size: 65536
87+
88+
decode:
89+
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
90+
model-path: "/model/"
91+
trust-remote-code: true
92+
tool-call-parser: deepseekv4
93+
94+
disaggregation-mode: "decode"
95+
disaggregation-transfer-backend: mooncake
96+
97+
tensor-parallel-size: 8
98+
data-parallel-size: 1
99+
expert-parallel-size: 1
100+
101+
moe-runner-backend: "flashinfer_mxfp4"
102+
disable-flashinfer-autotune: true
103+
104+
speculative-algo: "EAGLE"
105+
speculative-num-steps: 3
106+
speculative-eagle-topk: 1
107+
speculative-num-draft-tokens: 4
108+
109+
mem-fraction-static: 0.9
110+
max-running-requests: 8
111+
cuda-graph-max-bs: 8
112+
swa-full-tokens-ratio: 0.1
113+
context-length: 16384
114+
115+
benchmark:
116+
type: "sa-bench"
117+
isl: 8192
118+
osl: 1024
119+
random_range_ratio: 0.8
120+
concurrencies: "1"
121+
req_rate: "inf"
122+
use_chat_template: true
123+
custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer"
124+

0 commit comments

Comments
 (0)