Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ DeepSeek-R1 shares the same MoE architecture as DeepSeek-V3, so the same hardwar
python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-R1 --tp 8 --trust-remote-code
```

**Data Parallelism Attention (`--enable-dp-attention`):** Recommended for high-throughput scenarios. Use `--enable-dp-attention --tp 8 --dp 8` on a single 8-GPU node.
**Data Parallelism Attention (`--enable-dp-attention`):** For B200 NVFP4, enable DP Attention for best performance by passing `--tensor-parallel-size <TP>`, `--data-parallel-size <TP>` (the same value as the tensor-parallel size), `--enable-dp-attention`, `--enable-dp-attention-local-control-broadcast`, and `--enable-dp-lm-head`.

**NCCL timeout:** If model loading is slow, increase the distributed timeout, e.g. `--dist-timeout 3600`.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,11 @@ const lookupData = {
"scenario": "low-latency",
"parameters": {
"model_path": "nvidia/DeepSeek-R1-0528-FP4-v2",
"quantization": "modelopt_fp4",
"tensor_parallel_size": 4,
"attention_backend": "trtllm_mla",
"moe_runner_backend": "flashinfer_trtllm",
"enable_flashinfer_allreduce_fusion": true,
"cuda_graph_max_bs": 256,
"max_running_requests": 256,
"mem_fraction_static": 0.85,
Expand All @@ -99,7 +103,11 @@ const lookupData = {
"scenario": "high-throughput",
"parameters": {
"model_path": "nvidia/DeepSeek-R1-0528-FP4-v2",
"quantization": "modelopt_fp4",
"tensor_parallel_size": 4,
"attention_backend": "trtllm_mla",
"moe_runner_backend": "flashinfer_trtllm",
"enable_flashinfer_allreduce_fusion": true,
"cuda_graph_max_bs": 256,
"max_running_requests": 256,
"mem_fraction_static": 0.85,
Expand All @@ -116,7 +124,11 @@ const lookupData = {
"scenario": "low-latency",
"parameters": {
"model_path": "nvidia/DeepSeek-R1-0528-FP4-v2",
"quantization": "modelopt_fp4",
"tensor_parallel_size": 8,
"attention_backend": "trtllm_mla",
"moe_runner_backend": "flashinfer_trtllm",
"enable_flashinfer_allreduce_fusion": true,
"cuda_graph_max_bs": 256,
"max_running_requests": 256,
"mem_fraction_static": 0.85,
Expand All @@ -135,7 +147,11 @@ const lookupData = {
"scenario": "high-throughput",
"parameters": {
"model_path": "nvidia/DeepSeek-R1-0528-FP4-v2",
"quantization": "modelopt_fp4",
"tensor_parallel_size": 8,
"attention_backend": "trtllm_mla",
"moe_runner_backend": "flashinfer_trtllm",
"enable_flashinfer_allreduce_fusion": true,
"cuda_graph_max_bs": 256,
"max_running_requests": 256,
"mem_fraction_static": 0.85,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ export const DeepSeekR1BasicDeployment = () => {
}

const isXeon = hardware === 'xeon';
const enableB200Fp4Dp = hardware === 'b200' && quantization === 'fp4' && strategyValues.includes('dp');
const modelPath =
quantization === 'fp4'
? 'nvidia/DeepSeek-R1-0528-FP4-v2'
Expand All @@ -113,10 +114,22 @@ export const DeepSeekR1BasicDeployment = () => {
command += ` --model-path ${modelPath}`;

if (strategyValues.includes('tp')) {
command += isXeon ? ' \\\n --tp 6' : ' \\\n --tp 8';
if (enableB200Fp4Dp) {
command += ' \\\n --tensor-parallel-size 8';
} else {
command += isXeon ? ' \\\n --tp 6' : ' \\\n --tp 8';
}
}
if (strategyValues.includes('dp')) {
command += ' \\\n --dp 8 \\\n --enable-dp-attention';
if (enableB200Fp4Dp) {
command +=
' \\\n --data-parallel-size 8' +
' \\\n --enable-dp-attention' +
' \\\n --enable-dp-attention-local-control-broadcast' +
' \\\n --enable-dp-lm-head';
} else {
command += ' \\\n --dp 8 \\\n --enable-dp-attention';
}
}
if (strategyValues.includes('ep')) {
command += ' \\\n --ep 8';
Expand Down
Loading