Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1670,7 +1670,7 @@ dsr1-fp8-b300-dynamo-trt:
ep: 8
dp-attn: true
dsr1-fp4-b200-sglang:
image: lmsysorg/sglang:v0.5.12-cu130
image: lmsysorg/sglang:v0.5.12.post1
model: nvidia/DeepSeek-R1-0528-FP4-V2
model-prefix: dsr1
runner: b200
Expand All @@ -1682,13 +1682,14 @@ dsr1-fp4-b200-sglang:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, ep: 4, conc-start: 1, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 128 }
- { tp: 4, ep: 4, conc-start: 1, conc-end: 256 }
- { tp: 8, ep: 8, conc-list: [1] }
Comment thread
cquil11 marked this conversation as resolved.
- isl: 8192
osl: 1024
search-space:
- { tp: 4, ep: 4, conc-start: 1, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 16 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 }
- { tp: 8, ep: 8, conc-list: [1] }
# agentic-coding: temporarily disabled — blocked by e2e-tests.yml artifact
# name mismatch (downloads `agentic_*` but benchmark-tmpl.yml uploads as
# `bmk_agentic_*`). Re-enable once that workflow is aligned.
Expand Down
47 changes: 40 additions & 7 deletions benchmarks/single_node/fixed_seq_len/dsr1_fp4_b200.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

source "$(dirname "$0")/../../benchmark_lib.sh"

DP_ATTENTION="${DP_ATTENTION:-false}"

check_env_vars \
MODEL \
TP \
Expand All @@ -10,7 +12,13 @@ check_env_vars \
OSL \
RANDOM_RANGE_RATIO \
RESULT_FILENAME \
EP_SIZE
EP_SIZE \
DP_ATTENTION

# DP_ATTENTION is a strict boolean flag: accept only the literal strings
# "true" and "false"; anything else aborts the script with a message on stderr.
case "$DP_ATTENTION" in
  true|false)
    ;;
  *)
    echo "DP_ATTENTION must be true or false; got '$DP_ATTENTION'" >&2
    exit 1
    ;;
esac

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
Expand All @@ -22,13 +30,38 @@ nvidia-smi

SERVER_LOG=/workspace/server.log

# Default: recv every ~10 requests; if CONC ≥ 16, relax to ~30 requests between scheduler recv polls.
# Scheduler recv polling cadence: start from the default of ~10 requests
# between polls, and relax it to ~30 once the requested concurrency (CONC)
# reaches 16 or more.
SCHEDULER_RECV_INTERVAL=10
if [[ $CONC -ge 16 ]]; then
  SCHEDULER_RECV_INTERVAL=30
fi
echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"

CHUNKED_PREFILL_SIZE=16384
SGLANG_PARALLEL_ARGS=(
--tensor-parallel-size="$TP"
--data-parallel-size=1
)
SGLANG_DPA_ARGS=()

if [[ "$DP_ATTENTION" == "true" ]]; then
SCHEDULER_RECV_INTERVAL=1
CHUNKED_PREFILL_SIZE=32768
SGLANG_PARALLEL_ARGS=(
--tensor-parallel-size="$TP"
--data-parallel-size="$TP"
--enable-dp-attention
--enable-dp-attention-local-control-broadcast

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sglang cookbook

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added to SGLang cookbook: https://github.com/sgl-project/sglang/pull/28954/changes

command +=
          ' \\\n  --data-parallel-size 8' +
          ' \\\n  --enable-dp-attention' +
          ' \\\n  --enable-dp-attention-local-control-broadcast' +
          ' \\\n  --enable-dp-lm-head';
      }

Please let me know if there are any issues.

--enable-dp-lm-head
)
SGLANG_DPA_ARGS=(
--schedule-conservativeness 3.33
--enable-prefill-delayer
Comment thread
cquil11 marked this conversation as resolved.
)
fi

echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CHUNKED_PREFILL_SIZE: $CHUNKED_PREFILL_SIZE"

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
Expand All @@ -39,12 +72,12 @@ fi
start_gpu_monitor

set -x
PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
--tensor-parallel-size=$TP --data-parallel-size=1 \
SGLANG_RADIX_FORCE_MISS=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
"${SGLANG_PARALLEL_ARGS[@]}" \
--cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
--chunked-prefill-size 16384 \
--chunked-prefill-size "$CHUNKED_PREFILL_SIZE" \
--ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
--enable-symm-mem --disable-radix-cache --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
Comment thread
Ankur-singh marked this conversation as resolved.
--enable-symm-mem --disable-piecewise-cuda-graph --attention-backend trtllm_mla --moe-runner-backend flashinfer_trtllm --stream-interval 10 "${SGLANG_DPA_ARGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4115,3 +4115,12 @@
- "Use the dedicated ARM64 MiniMax-M3 performance image; benchmark settings unchanged"
- "Allocate FlashInfer MNNVL workspace for one-shot TP8 all-reduce during CUDA graph capture"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1897

- config-keys:
- dsr1-fp4-b200-sglang
description:
- "Update B200 FP4 SGLang image from lmsysorg/sglang:v0.5.12-cu130 to lmsysorg/sglang:v0.5.12.post1."
- "Update search space: 1k/1k TP4/EP4 conc 1-256 and TP8/EP8 conc 1; 8k/1k TP4/EP4 conc 1-128, add TP4/EP4 DP-attention conc 64-256, and keep TP8/EP8 conc 1."
- "For DP-attention runs, use TP-sized data parallelism with DP attention local-control broadcast, DP LM head, prefill delayer, scheduler recv interval 1, chunked prefill size 32768, and schedule conservativeness 3.33."
- "Set SGLANG_RADIX_FORCE_MISS=1, remove --disable-radix-cache, and explicitly pass --disable-piecewise-cuda-graph."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1792