Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1670,7 +1670,7 @@ dsr1-fp8-b300-dynamo-trt:
ep: 8
dp-attn: true
dsr1-fp4-b200-sglang:
image: lmsysorg/sglang:v0.5.12-cu130
image: lmsysorg/sglang:v0.5.12.post1
model: nvidia/DeepSeek-R1-0528-FP4-V2
model-prefix: dsr1
runner: b200
Expand All @@ -1682,13 +1682,14 @@ dsr1-fp4-b200-sglang:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, ep: 4, conc-start: 1, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 128 }
- { tp: 4, ep: 4, conc-start: 1, conc-end: 256 }
- { tp: 8, ep: 8, conc-list: [1] }
Comment thread
cquil11 marked this conversation as resolved.
- isl: 8192
osl: 1024
search-space:
- { tp: 4, ep: 4, conc-start: 1, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 16 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 }
- { tp: 8, ep: 8, conc-list: [1] }
# agentic-coding: temporarily disabled — blocked by e2e-tests.yml artifact
# name mismatch (downloads `agentic_*` but benchmark-tmpl.yml uploads as
# `bmk_agentic_*`). Re-enable once that workflow is aligned.
Expand Down
47 changes: 40 additions & 7 deletions benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

source "$(dirname "$0")/../../benchmark_lib.sh"

DP_ATTENTION="${DP_ATTENTION:-false}"

check_env_vars \
MODEL \
TP \
Expand All @@ -10,7 +12,13 @@ check_env_vars \
OSL \
RANDOM_RANGE_RATIO \
RESULT_FILENAME \
EP_SIZE
EP_SIZE \
DP_ATTENTION

if [[ "$DP_ATTENTION" != "true" && "$DP_ATTENTION" != "false" ]]; then
echo "DP_ATTENTION must be true or false; got '$DP_ATTENTION'" >&2
exit 1
fi

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
Expand All @@ -22,13 +30,38 @@ nvidia-smi

SERVER_LOG=/workspace/server.log

# Default: recv every ~10 requests; if CONC 16, relax to ~30 requests between scheduler recv polls.
# Default: recv every ~10 requests; if CONC >= 16, relax to ~30 requests between scheduler recv polls.
if [[ $CONC -ge 16 ]]; then
SCHEDULER_RECV_INTERVAL=30
else
SCHEDULER_RECV_INTERVAL=10
fi
echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"

CHUNKED_PREFILL_SIZE=16384
SGLANG_PARALLEL_ARGS=(
--tensor-parallel-size="$TP"
--data-parallel-size=1
)
SGLANG_DPA_ARGS=()

if [[ "$DP_ATTENTION" == "true" ]]; then
SCHEDULER_RECV_INTERVAL=1
CHUNKED_PREFILL_SIZE=32768
SGLANG_PARALLEL_ARGS=(
--tensor-parallel-size="$TP"
--data-parallel-size="$TP"
--enable-dp-attention
--enable-dp-attention-local-control-broadcast

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SGLang cookbook

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added to SGLang cookbook: https://github.com/sgl-project/sglang/pull/28954/changes

command +=
          ' \\\n  --data-parallel-size 8' +
          ' \\\n  --enable-dp-attention' +
          ' \\\n  --enable-dp-attention-local-control-broadcast' +
          ' \\\n  --enable-dp-lm-head';
      }

Please let me know if there are any issues.

--enable-dp-lm-head
)
SGLANG_DPA_ARGS=(
--schedule-conservativeness 3.33
--enable-prefill-delayer
Comment thread
cquil11 marked this conversation as resolved.
)
fi

echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CHUNKED_PREFILL_SIZE: $CHUNKED_PREFILL_SIZE"

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
Expand All @@ -39,12 +72,12 @@ fi
start_gpu_monitor

set -x
PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
--tensor-parallel-size=$TP --data-parallel-size=1 \
SGLANG_RADIX_FORCE_MISS=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
"${SGLANG_PARALLEL_ARGS[@]}" \
--cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
--chunked-prefill-size 16384 \
--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
--ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
Comment thread
Ankur-singh marked this conversation as resolved.
--enable-symm-mem --disable-piecewise-cuda-graph --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 "${SGLANG_DPA_ARGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4115,3 +4115,12 @@
- "Use the dedicated ARM64 MiniMax-M3 performance image; benchmark settings unchanged"
- "Allocate FlashInfer MNNVL workspace for one-shot TP8 all-reduce during CUDA graph capture"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1897

- config-keys:
- dsr1-fp4-b200-sglang
description:
- "Update B200 FP4 SGLang image from lmsysorg/sglang:v0.5.12-cu130 to lmsysorg/sglang:v0.5.12.post1."
- "Update search space: 1k/1k TP4/EP4 conc 1-256 and TP8/EP8 conc 1; 8k/1k TP4/EP4 conc 1-128, add TP4/EP4 DP-attention conc 64-256, and keep TP8/EP8 conc 1."
- "For DP-attention runs, use TP-sized data parallelism with DP attention local-control broadcast, DP LM head, prefill delayer, scheduler recv interval 1, chunked prefill size 32768, and schedule conservativeness 3.33."
- "Set SGLANG_RADIX_FORCE_MISS=1, remove --disable-radix-cache, and explicitly pass --disable-piecewise-cuda-graph."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1792