-
Notifications
You must be signed in to change notification settings - Fork 208
[DO NOT MERGE] Run-only: gb200 dsr1 measured power+temp (canonical NVIDIA) #1791
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 13 commits
b075fde
8f8ba55
8f8a909
33acb9b
a4a7a32
47d8929
a111370
8ad9cad
a4f9eaf
d693306
cd680da
f5713be
d6ace77
f9bda03
94c2add
80552d7
c8d2e5d
35e02d1
76bbfcb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -128,11 +128,19 @@ if [[ "$IS_MULTINODE" == "true" ]]; then | |
| mkdir -p recipes/sglang/glm5/b200-fp8 | ||
| cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8" recipes/sglang/glm5/b200-fp8 | ||
| elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then | ||
| git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" | ||
| # MEASURED-POWER CAMPAIGN (dsr1-disagg-NVIDIA): clone the SemiAnalysisAI | ||
| # perfmon fork (NVIDIA/srt-slurm PR #35, feat/inferencex-perfmon) instead | ||
| # of NVIDIA/srt-slurm@main. The fork ships the SAME b200 dsr1 recipe this | ||
| # config points at (recipes/b200-fp4/1k1k.yaml) PLUS the per-node | ||
| # nvidia-smi perfmon machinery (src/srtctl/monitor/perfmon.py) that writes | ||
| # perf_samples_*.csv when a recipe declares `monitoring:`. That is what | ||
| # turns the b200 dsr1-disagg energy charts MEASURED. The recipe lives in | ||
| # the fork directly, so the prior mkdir/cp into recipes/sglang/dsr1/b200-fp4 | ||
| # (a different path, unused by this config's CONFIG_FILE) is dropped. | ||
| git clone https://github.com/SemiAnalysisAI/srt-slurm.git "$SRT_REPO_DIR" | ||
| cd "$SRT_REPO_DIR" || exit 1 | ||
| git checkout main | ||
| mkdir -p recipes/sglang/dsr1/b200-fp4 | ||
| cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4" recipes/sglang/dsr1/b200-fp4 | ||
| git checkout feat/inferencex-perfmon | ||
| export PERFMON_ENABLED=1 | ||
| else | ||
| git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" | ||
| cd "$SRT_REPO_DIR" || exit 1 | ||
|
|
@@ -259,6 +267,23 @@ EOF | |
| # so large-model loads (e.g. DSR1-FP8 ~680GB off shared FS) finish in time. | ||
| # Uses ${CONFIG_FILE%%:*} because CONFIG_FILE may carry an :override[N] suffix. | ||
| sed -i 's/^ max_attempts: [0-9]*/ max_attempts: 720/' "${CONFIG_FILE%%:*}" | ||
| # MEASURED-POWER CAMPAIGN: enable per-node nvidia-smi perfmon and give the | ||
| # slow dsr1 bring-up headroom. This recipe is `base:` + `zip_override_*`, so | ||
| # `monitoring:` must be inserted INTO base: (2-space indent) to survive the | ||
| # override zip — a top-level EOF append (the flat-recipe trick) is ignored | ||
| # here. health_check lives at base.health_check.max_attempts (4-space), which | ||
| # the 2-space sed above does NOT match, so bump it here (360->540, i.e. | ||
| # 60min->90min, for the ~35min dsr1 warmup). Guarded on PERFMON_ENABLED so | ||
| # only the perfmon-fork dsr1 path is affected. | ||
| if [ "${PERFMON_ENABLED:-0}" = "1" ]; then | ||
| CFG="${CONFIG_FILE%%:*}" | ||
| if [ -f "$CFG" ] && ! grep -q '^[[:space:]]*monitoring:' "$CFG"; then | ||
| awk '/^base:/{print; print " monitoring:"; print " enabled: true"; print " sample_interval: 1.0"; next} {print}' "$CFG" > "$CFG.perfmon.tmp" && mv "$CFG.perfmon.tmp" "$CFG" | ||
| echo "[perfmon] injected monitoring: under base: in $CFG" | ||
| fi | ||
| sed -i 's/^ max_attempts: [0-9]*/ max_attempts: 540/' "$CFG" | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. b200 sed overwrites all retries. Low Severity. Under [… comment body truncated in extraction]. Reviewed by Cursor Bugbot for commit 35e02d1. Configure here. |
||
| echo "[perfmon] set base.health_check.max_attempts=540 (90min) in $CFG" | ||
| fi | ||
| SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) | ||
| echo "$SRTCTL_OUTPUT" | ||
|
|
||
|
|
@@ -384,6 +409,25 @@ EOF | |
| fi | ||
| fi | ||
|
|
||
| # MEASURED-POWER CAMPAIGN: stage per-node perfmon CSVs for the downstream | ||
| # "Process result" step BEFORE the outputs cleanup below deletes them. | ||
| # perfmon writes perf_samples_*.csv under the per-node job logs; copy them | ||
| # into $GITHUB_WORKSPACE and export GPU_METRICS_CSV_GLOB so aggregate_power.py | ||
| # emits measured power + temp/util/mem. b200-dgxc logs live on shared Lustre | ||
| # (LOGS_DIR is populated by the compute nodes), so a recursive find picks up | ||
| # every node's CSV. Guarded on PERFMON_ENABLED (dsr1 perfmon-fork path only). | ||
| if [ "${PERFMON_ENABLED:-0}" = "1" ] && [ -d "$LOGS_DIR" ]; then | ||
| if find "$LOGS_DIR" -name 'perf_samples_*.csv' 2>/dev/null | grep -q .; then | ||
| mkdir -p "$GITHUB_WORKSPACE/perf_samples" | ||
| find "$LOGS_DIR" -name 'perf_samples_*.csv' -exec cp {} "$GITHUB_WORKSPACE/perf_samples/" \; | ||
| perf_csv_count=$(ls "$GITHUB_WORKSPACE/perf_samples"/perf_samples_*.csv 2>/dev/null | wc -l | tr -d ' ') | ||
| echo "GPU_METRICS_CSV_GLOB=$GITHUB_WORKSPACE/perf_samples/perf_samples_*.csv" >> "$GITHUB_ENV" | ||
| echo "[perfmon] staged $perf_csv_count per-node perf_samples_*.csv to \$GITHUB_WORKSPACE/perf_samples/" | ||
| else | ||
| echo "[perfmon] WARNING: monitoring enabled but no perf_samples_*.csv under $LOGS_DIR — measured power aggregation skipped (compute-node visibility?)" >&2 | ||
| fi | ||
| fi | ||
|
|
||
| # Clean up srt-slurm outputs to prevent NFS silly-rename lock files | ||
| # from blocking the next job's checkout on this runner | ||
| echo "Cleaning up srt-slurm outputs..." | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -283,6 +283,20 @@ elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]]; then | |
| git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" | ||
| cd "$SRT_REPO_DIR" | ||
| git checkout sa-submission-q2-2026 | ||
| elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsr1" ]]; then | ||
| # MEASURED-POWER CAMPAIGN (dsr1-disagg-NVIDIA): clone the SemiAnalysisAI | ||
| # perfmon fork instead of cquil11/srt-slurm-nv. The fork ships the SAME | ||
| # gb200 dsr1 recipes (recipes/gb200-fp4|fp8/<seq>/*.yaml, model.path | ||
| # dsr1-fp4 / served deepseek-ai/DeepSeek-R1 — the exact CONFIG_FILE the | ||
| # dsr1-fp?-gb200-dynamo-sglang configs reference) PLUS the per-node | ||
| # nvidia-smi perfmon machinery (src/srtctl/monitor/perfmon.py) that writes | ||
| # perf_samples_*.csv when a recipe declares `monitoring:`. Pointing dsr1 here | ||
| # is what turns the gb200 dsr1 energy charts MEASURED. Minor recipe/srtctl | ||
| # drift vs cquil11's validated source is acceptable for a power/temp measure. | ||
| git clone https://github.com/SemiAnalysisAI/srt-slurm.git "$SRT_REPO_DIR" | ||
| cd "$SRT_REPO_DIR" | ||
| git checkout feat/inferencex-perfmon | ||
| export PERFMON_ENABLED=1 | ||
|
cursor[bot] marked this conversation as resolved.
|
||
| else | ||
| git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR" | ||
| cd "$SRT_REPO_DIR" | ||
|
|
@@ -385,6 +399,23 @@ if [[ ! -f "$CONFIG_PATH" ]]; then | |
| fi | ||
| sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH" | ||
|
|
||
| # MEASURED-POWER CAMPAIGN: enable per-node nvidia-smi perfmon and give the slow | ||
| # dsr1 bring-up headroom. `monitoring:` makes the orchestrator spawn perfmon.py | ||
| # (-> perf_samples_*.csv); `health_check` raises the default 1800s server-ready | ||
| # ceiling (dsr1 warmup = load 671B weights + FlashInfer autotune + CUDA-graph | ||
| # capture ~= 35min > 30min default, which otherwise times out and kills etcd). | ||
| # Idempotent; only the perfmon-fork dsr1 path sets PERFMON_ENABLED. | ||
| if [ "${PERFMON_ENABLED:-0}" = "1" ] && [ -f "$CONFIG_PATH" ]; then | ||
| if ! grep -q '^monitoring:' "$CONFIG_PATH"; then | ||
| printf '\nmonitoring:\n enabled: true\n sample_interval: 1.0\n' >> "$CONFIG_PATH" | ||
| echo "[perfmon] injected monitoring: into $CONFIG_PATH" | ||
| fi | ||
| if ! grep -q '^health_check:' "$CONFIG_PATH"; then | ||
| printf '\nhealth_check:\n max_attempts: 540\n interval_seconds: 10\n' >> "$CONFIG_PATH" | ||
| echo "[perfmon] injected health_check (90min ceiling) into $CONFIG_PATH" | ||
| fi | ||
|
cursor[bot] marked this conversation as resolved.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. GB300 omits health_check bump. Medium Severity. For the dsr1 perfmon path, gb200 injects a 90-minute `health_check` [… remainder of comment body truncated in extraction]. Additional Locations (1). Reviewed by Cursor Bugbot for commit d6ace77. Configure here. |
||
| fi | ||
|
|
||
| if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then | ||
| SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1) | ||
| else | ||
|
|
@@ -447,6 +478,27 @@ else | |
| echo "Warning: Logs directory not found at $LOGS_DIR" | ||
| fi | ||
|
|
||
| # MEASURED-POWER CAMPAIGN: stage per-node perfmon CSVs for the downstream | ||
| # "Process result" step. perfmon writes perf_samples_*.csv under the job logs | ||
| # dir; copy them into $GITHUB_WORKSPACE and export GPU_METRICS_CSV_GLOB so | ||
| # process_result.py runs aggregate_power.py and patches the agg JSON with | ||
| # measured power + temp/util/mem. Guarded on PERFMON_ENABLED (dsr1 path only). | ||
| # NOTE: if perf_samples are missing on this cluster, the per-node CSVs (written | ||
| # by perfmon on COMPUTE nodes) may not be visible in the head-node LOGS_DIR — | ||
| # the Oracle/watchtower gb200 cluster does not cross-mount /home/slurm-shared | ||
| # (see the minimax shared-FS handling above); dsr1 may need the same treatment. | ||
| if [ "${PERFMON_ENABLED:-0}" = "1" ] && [ -d "$LOGS_DIR" ]; then | ||
| if find "$LOGS_DIR" -name 'perf_samples_*.csv' 2>/dev/null | grep -q .; then | ||
| mkdir -p "$GITHUB_WORKSPACE/perf_samples" | ||
| find "$LOGS_DIR" -name 'perf_samples_*.csv' -exec cp {} "$GITHUB_WORKSPACE/perf_samples/" \; | ||
| perf_csv_count=$(ls "$GITHUB_WORKSPACE/perf_samples"/perf_samples_*.csv 2>/dev/null | wc -l | tr -d ' ') | ||
| echo "GPU_METRICS_CSV_GLOB=$GITHUB_WORKSPACE/perf_samples/perf_samples_*.csv" >> "$GITHUB_ENV" | ||
| echo "[perfmon] staged $perf_csv_count per-node perf_samples_*.csv to \$GITHUB_WORKSPACE/perf_samples/" | ||
| else | ||
| echo "[perfmon] WARNING: monitoring enabled but no perf_samples_*.csv under $LOGS_DIR — measured power aggregation skipped (compute-node visibility? see note above)" >&2 | ||
| fi | ||
| fi | ||
|
|
||
| if [[ "${EVAL_ONLY:-false}" != "true" ]]; then | ||
| if [ ! -d "$LOGS_DIR" ]; then | ||
| exit 1 | ||
|
|
||


Uh oh!
There was an error while loading. Please reload this page.