sgl-project · faradawn · Jun 5, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 22, 2026
@@ -95,7 +95,7 @@ DeepSeek-R1 shares the same MoE architecture as DeepSeek-V3, so the same hardwar
 python3 -m sglang.compile_deep_gemm --model deepseek-ai/DeepSeek-R1 --tp 8 --trust-remote-code
 ```
 
-**Data Parallelism Attention (`--enable-dp-attention`):** Recommended for high-throughput scenarios. Use `--enable-dp-attention --tp 8 --dp 8` on a single 8-GPU node.
+**Data Parallelism Attention (`--enable-dp-attention`):** For B200 NVFP4, enable DP Attention for best performance by passing `--tensor-parallel-size <TP>`, `--data-parallel-size <TP>` (set to the same value as the tensor-parallel size), `--enable-dp-attention`, `--enable-dp-attention-local-control-broadcast`, and `--enable-dp-lm-head`.
 
 **NCCL timeout:** If model loading is slow, increase: `--dist-timeout 3600`.
 

@@ -82,7 +82,11 @@ const lookupData = {
       "scenario": "low-latency",
       "parameters": {
         "model_path": "nvidia/DeepSeek-R1-0528-FP4-v2",
+        "quantization": "modelopt_fp4",
         "tensor_parallel_size": 4,
+        "attention_backend": "trtllm_mla",
+        "moe_runner_backend": "flashinfer_trtllm",
+        "enable_flashinfer_allreduce_fusion": true,
         "cuda_graph_max_bs": 256,
         "max_running_requests": 256,
         "mem_fraction_static": 0.85,
@@ -99,7 +103,11 @@ const lookupData = {
       "scenario": "high-throughput",
       "parameters": {
         "model_path": "nvidia/DeepSeek-R1-0528-FP4-v2",
+        "quantization": "modelopt_fp4",
         "tensor_parallel_size": 4,
+        "attention_backend": "trtllm_mla",
+        "moe_runner_backend": "flashinfer_trtllm",
+        "enable_flashinfer_allreduce_fusion": true,
         "cuda_graph_max_bs": 256,
         "max_running_requests": 256,
         "mem_fraction_static": 0.85,
@@ -116,7 +124,11 @@ const lookupData = {
       "scenario": "low-latency",
       "parameters": {
         "model_path": "nvidia/DeepSeek-R1-0528-FP4-v2",
+        "quantization": "modelopt_fp4",
         "tensor_parallel_size": 8,
+        "attention_backend": "trtllm_mla",
+        "moe_runner_backend": "flashinfer_trtllm",
+        "enable_flashinfer_allreduce_fusion": true,
         "cuda_graph_max_bs": 256,
         "max_running_requests": 256,
         "mem_fraction_static": 0.85,
@@ -135,7 +147,11 @@ const lookupData = {
       "scenario": "high-throughput",
       "parameters": {
         "model_path": "nvidia/DeepSeek-R1-0528-FP4-v2",
+        "quantization": "modelopt_fp4",
         "tensor_parallel_size": 8,
+        "attention_backend": "trtllm_mla",
+        "moe_runner_backend": "flashinfer_trtllm",
+        "enable_flashinfer_allreduce_fusion": true,
         "cuda_graph_max_bs": 256,
         "max_running_requests": 256,
         "mem_fraction_static": 0.85,

@@ -102,6 +102,7 @@ export const DeepSeekR1BasicDeployment = () => {
     }
 
     const isXeon = hardware === 'xeon';
+    const enableB200Fp4Dp = hardware === 'b200' && quantization === 'fp4' && strategyValues.includes('dp');
     const modelPath =
       quantization === 'fp4'
         ? 'nvidia/DeepSeek-R1-0528-FP4-v2'
@@ -113,10 +114,22 @@ export const DeepSeekR1BasicDeployment = () => {
     command += `  --model-path ${modelPath}`;
 
     if (strategyValues.includes('tp')) {
-      command += isXeon ? ' \\\n  --tp 6' : ' \\\n  --tp 8';
+      if (enableB200Fp4Dp) {
+        command += ' \\\n  --tensor-parallel-size 8';
+      } else {
+        command += isXeon ? ' \\\n  --tp 6' : ' \\\n  --tp 8';
+      }
     }
     if (strategyValues.includes('dp')) {
-      command += ' \\\n  --dp 8 \\\n  --enable-dp-attention';
+      if (enableB200Fp4Dp) {
+        command +=
+          ' \\\n  --data-parallel-size 8' +
+          ' \\\n  --enable-dp-attention' +
+          ' \\\n  --enable-dp-attention-local-control-broadcast' +
+          ' \\\n  --enable-dp-lm-head';
+      } else {
+        command += ' \\\n  --dp 8 \\\n  --enable-dp-attention';
+      }
     }
     if (strategyValues.includes('ep')) {
       command += ' \\\n  --ep 8';