SemiAnalysisAI · arygupt · Jun 15, 2026 · Jun 15, 2026 · Jun 16, 2026 · Jun 5, 2026
@@ -12143,3 +12143,122 @@ kimik2.5-fp4-gb300-dynamo-vllm:
           tp: 1
           ep: 24
           dp-attn: true
+
+# --- Measured-power campaign (dsr1-disagg-NVIDIA) — gb200 -------------------
+# Single-job validation of the perfmon plumbing on the HEALTHY gb200 runner
+# (the gb300-nv fleet is wedged on stale-NFS pre-run cleanup). Same recipe +
+# disagg topology as the 1k/1k low-latency cell of dsr1-fp4-gb200-dynamo-sglang
+# (1 prefill TP4 + 2 decode TP4), trimmed to one concurrency so the matrix
+# expands to ONE gb200 job. runner: gb200 -> runners/launch_gb200-nv.sh, whose
+# dsr1 branch now clones the perfmon fork (gb200 dsr1 recipes + nvidia-smi
+# perfmon) -> per-node perf_samples_*.csv -> measured per-phase power+temp.
+# CANONICAL NVIDIA data, sidestepping the wedged gb300-nv fleet. Run-only.
+dsr1-fp4-gb200-dynamo-sglang-powercheck:
+  image: "lmsysorg/sglang:v0.5.8-cu130"
+  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
+  model-prefix: dsr1
+  runner: gb200
+  precision: fp4
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 8 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/gb200-fp4/1k1k/low-latency.yaml"
+        decode:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: false
+
+# --- Measured-power campaign (dsr1-disagg-NVIDIA) — b200 --------------------
+# Single-job validation of the perfmon plumbing on the b200-multinode pool
+# (b200-dgxc-slurm_{7,8,9} -> runners/launch_b200-dgxc.sh). Mirrors the first
+# 1k/1k search-space cell of dsr1-fp4-b200-dynamo-sglang EXACTLY
+# (zip_override_stp_lowlat[0] = STP, 1 prefill TP4 DP4 EP4 + 5 decode TP8 EP8),
+# trimmed to a single concurrency so the matrix expands to ONE b200 job. The
+# b200-multinode pool is the only b200 multinode pool and has been idle 60+
+# sweeps, so its health is UNVERIFIED (the green b200 runs were the single-node
+# pool); idle = no accumulated stale benchmark_logs, so it is the best bet of the
+# wedged/idle NVIDIA multinode pools. runner: b200-multinode -> launch_b200-dgxc.sh,
+# whose dsr1 branch now clones the perfmon fork (b200 dsr1 recipe + nvidia-smi
+# perfmon), inserts monitoring: INTO base: + a 90min health_check, and stages
+# perf_samples_*.csv -> measured per-phase power + temp. CANONICAL NVIDIA (b200).
+# Run-only; do not merge (duplicates #1574 consumer code).
+dsr1-fp4-b200-dynamo-sglang-powercheck:
+  image: lmsysorg/sglang:v0.5.8.post1-cu130-runtime
+  model: deepseek-r1-fp4
+  model-prefix: dsr1
+  runner: b200-multinode
+  precision: fp4
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - conc-list:
+        - 16
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "CONFIG_FILE=recipes/b200-fp4/1k1k.yaml:zip_override_stp_lowlat[0]"
+        decode:
+          num-worker: 5
+          tp: 8
+          ep: 8
+          dp-attn: false
+
+
+# --- Measured-power campaign (dsr1-disagg-NVIDIA) — gb300 -------------------
+# Minimal single-job validation of the perfmon plumbing on GB300 before the
+# full dsr1-disagg-NVIDIA measured sweep. Identical to the 1k1k low-latency
+# scenario of dsr1-fp4-gb300-dynamo-sglang (same recipe + topology), trimmed
+# to one concurrency so it runs as ONE gb300 job. runner: gb300-nv routes to
+# runners/launch_gb300-nv.sh, whose dsr1 branch clones the perfmon fork and
+# injects monitoring: -> per-node perf_samples_*.csv -> measured per-phase
+# board power. Remove once the path is proven (or keep as a power canary).
+dsr1-fp4-gb300-dynamo-sglang-powercheck:
+  image: "lmsysorg/sglang:v0.5.8.post1-cu130-runtime"
+  model: nvidia/DeepSeek-R1-0528-NVFP4-v2
+  model-prefix: dsr1
+  runner: gb300-nv
+  precision: fp4
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 1024
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [ 8 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 1
+          dp-attn: false
+          additional-settings:
+          - "CONFIG_FILE=recipes/gb300-fp4/1k1k/low_latency.yaml"
+        decode:
+          num-worker: 2
+          tp: 4
+          ep: 1
+          dp-attn: false
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -177,12 +177,30 @@ jobs:
         run: &slurm-cleanup |
           if command -v squeue >/dev/null 2>&1; then
             echo "[Slurm] Cleaning up jobs with name: ${{ runner.name }} ..."
-            scancel --name="${{ runner.name }}" || true
-            while [ -n "$(squeue --name='${{ runner.name }}' --noheader --format='%i')" ]; do
-              squeue --name="${{ runner.name }}"
+            timeout 30 scancel --name="${{ runner.name }}" 2>/dev/null || true
+            _drain_deadline=$((SECONDS + 120))
+            # Wrap EVERY slurm call in `timeout`: on the NVIDIA clusters squeue
+            # itself hangs (unresponsive slurmctld / zombie), and a hang in the
+            # while-condition's $(squeue ...) would block before the deadline check.
+            # A squeue killed by timeout 30 (before printing) yields empty output, so the loop then exits as if drained.
+            while [ -n "$(timeout 30 squeue --name='${{ runner.name }}' --noheader --format='%i' 2>/dev/null)" ]; do
+              if [ "$SECONDS" -ge "$_drain_deadline" ]; then
+                echo "[Slurm] drain exceeded 120s; force-cancelling (KILL) and proceeding (zombie/unresponsive slurm)"
+                timeout 30 scancel --signal=KILL --name="${{ runner.name }}" 2>/dev/null || true
+                sleep 5
+                break
+              fi
+              timeout 30 squeue --name="${{ runner.name }}" 2>/dev/null || true
               sleep 5
             done
           fi
+          # Drop root-owned leftovers from a prior (often cancelled) multinode
+          # run. The benchmark container runs as root and writes benchmark_logs/;
+          # if the job was cancelled its cleanup trap never ran, leaving
+          # root-owned dirs that actions/checkout (clean: true) can't rmdir
+          # (EACCES) — which then poison-fails EVERY subsequent job on that runner.
+          # Shared anchor (pre- and post-run cleanup). Best-effort: sudo -n never prompts, and a failure only warns.
+          timeout 60 sudo -n rm -rf "${GITHUB_WORKSPACE}/benchmark_logs" 2>/dev/null || echo "[cleanup] WARNING: could not remove ${GITHUB_WORKSPACE}/benchmark_logs (sudo -n/rm failed or timed out)" >&2
 
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -4028,3 +4028,13 @@
     - "Recover the failed official ingest for PR #1796 from validated sweep run 27663808752 (attempt 2)"
     - "Artifact-only recovery: reuse 23 fixed-sequence rows and 2 eval rows without rerunning benchmarks"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1884
+
+- config-keys:
+    - dsr1-fp4-gb300-dynamo-sglang-powercheck
+  description:
+    - "Minimal single-job GB300 validation of the measured-power perfmon plumbing before the full dsr1-disagg-NVIDIA sweep (NVIDIA analogue of the AMD smoke run in PR #1574). Same recipe + disagg topology as the 1k/1k low-latency cell of dsr1-fp4-gb300-dynamo-sglang (1 prefill TP4 + 2 decode TP4), trimmed to one concurrency (8) so the changelog matrix expands to exactly ONE gb300 job and the shared cluster stays clear."
+    - "Exercises the runner-side wiring added to runners/launch_gb300-nv.sh: the dsr1 branch clones SemiAnalysisAI/srt-slurm@feat/inferencex-perfmon (NVIDIA/srt-slurm PR #35) instead of upstream sa-submission, recursively injects `monitoring:` into every recipes/<hw>/<seq>/*.yaml (find -type f, never a flat glob — the flat glob is what silently produced 0 power rows in sweep #26548110246), and stages the per-node perf_samples_*.csv to $GITHUB_WORKSPACE before `rm -rf outputs`, setting GPU_METRICS_CSV_GLOB for the Process-result step."
+    - "Success criteria: job green AND the agg JSON patched with avg_power_w + per-stage prefill_avg_power_w/decode_avg_power_w + workers[] (role-labelled prefill/decode) from utils/aggregate_power.py. If those fields are absent the plumbing is not yet proven and the full dsr1-disagg-NVIDIA sweep stays gated. Remove this key (or keep as a GB300 power canary) once validated."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1791
+# re-arm 2026-06-22b: validate perfmon teardown fix (srt-slurm@b9526e5) — probe for pre-existing CG job
+# re-arm 2026-06-22c: cancel+re-arm gamble for a healthy runner — _0 squeue/scancel hang confirmed LIVE (concurrent probe on gb300-nv_2 clean, squeue 37ms, queue empty; _0 wedged 8min in pre-run cleanup with empty queue = host-local slurm-client hang, not a stuck job)
diff --git a/runners/launch_b200-dgxc.sh b/runners/launch_b200-dgxc.sh
@@ -128,11 +128,19 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
         mkdir -p recipes/sglang/glm5/b200-fp8
         cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/b200-fp8" recipes/sglang/glm5/b200-fp8
     elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsr1" && $PRECISION == "fp4" ]]; then
-        git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
+        # MEASURED-POWER CAMPAIGN (dsr1-disagg-NVIDIA): clone the SemiAnalysisAI
+        # perfmon fork (NVIDIA/srt-slurm PR #35, feat/inferencex-perfmon) instead
+        # of NVIDIA/srt-slurm@main. The fork ships the SAME b200 dsr1 recipe this
+        # config points at (recipes/b200-fp4/1k1k.yaml) PLUS the per-node
+        # nvidia-smi perfmon machinery (src/srtctl/monitor/perfmon.py) that writes
+        # perf_samples_*.csv when a recipe declares `monitoring:`. That is what
+        # turns the b200 dsr1-disagg energy charts MEASURED. The recipe lives in
+        # the fork directly, so the prior mkdir/cp into recipes/sglang/dsr1/b200-fp4
+        # (a different path, unused by this config's CONFIG_FILE) is dropped.
+        git clone https://github.com/SemiAnalysisAI/srt-slurm.git "$SRT_REPO_DIR"
         cd "$SRT_REPO_DIR" || exit 1
-        git checkout main
-        mkdir -p recipes/sglang/dsr1/b200-fp4
-        cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4" recipes/sglang/dsr1/b200-fp4
+        git checkout feat/inferencex-perfmon || exit 1  # the perfmon branch is mandatory: never continue on the fork's default branch
+        export PERFMON_ENABLED=1
     else
         git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
         cd "$SRT_REPO_DIR" || exit 1
@@ -259,6 +267,23 @@ EOF
     # so large-model loads (e.g. DSR1-FP8 ~680GB off shared FS) finish in time.
     # Uses ${CONFIG_FILE%%:*} because CONFIG_FILE may carry an :override[N] suffix.
     sed -i 's/^  max_attempts: [0-9]*/  max_attempts: 720/' "${CONFIG_FILE%%:*}"
+    # MEASURED-POWER CAMPAIGN: enable per-node nvidia-smi perfmon and give the
+    # slow dsr1 bring-up headroom. This recipe is `base:` + `zip_override_*`, so
+    # `monitoring:` must be inserted INTO base: (2-space indent) to survive the
+    # override zip — a top-level EOF append (the flat-recipe trick) is ignored
+    # here. health_check lives at base.health_check.max_attempts (4-space), which
+    # the 2-space sed above does NOT match, so bump it here (360->540, i.e.
+    # 60min->90min, for the ~35min dsr1 warmup). Guarded on PERFMON_ENABLED so
+    # only the perfmon-fork dsr1 path is affected.
+    if [ "${PERFMON_ENABLED:-0}" = "1" ]; then
+        CFG="${CONFIG_FILE%%:*}"
+        if [ -f "$CFG" ] && ! grep -q '^[[:space:]]*monitoring:' "$CFG"; then
+            awk '/^base:/{print; print "  monitoring:"; print "    enabled: true"; print "    sample_interval: 1.0"; next} {print}' "$CFG" > "$CFG.perfmon.tmp" && mv "$CFG.perfmon.tmp" "$CFG"
+            grep -q '^  monitoring:' "$CFG" && echo "[perfmon] injected monitoring: under base: in $CFG" || echo "[perfmon] WARNING: monitoring NOT injected into $CFG (no top-level base: key?)" >&2  # awk is a no-op without a base: line, so verify before claiming success
+        fi
+        sed -i 's/^    max_attempts: [0-9]*/    max_attempts: 540/' "$CFG"
+        echo "[perfmon] set base.health_check.max_attempts=540 (90min) in $CFG"
+    fi
     SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "b200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
     echo "$SRTCTL_OUTPUT"
 
@@ -384,6 +409,25 @@ EOF
         fi
     fi
 
+    # MEASURED-POWER CAMPAIGN: stage per-node perfmon CSVs for the downstream
+    # "Process result" step BEFORE the outputs cleanup below deletes them.
+    # perfmon writes perf_samples_*.csv under the per-node job logs; copy them
+    # into $GITHUB_WORKSPACE and export GPU_METRICS_CSV_GLOB so aggregate_power.py
+    # emits measured power + temp/util/mem. b200-dgxc logs live on shared Lustre
+    # (LOGS_DIR is populated by the compute nodes), so a recursive find picks up
+    # every node's CSV. Guarded on PERFMON_ENABLED (dsr1 perfmon-fork path only).
+    if [ "${PERFMON_ENABLED:-0}" = "1" ] && [ -d "$LOGS_DIR" ]; then
+        if find "$LOGS_DIR" -name 'perf_samples_*.csv' 2>/dev/null | grep -q .; then
+            mkdir -p "$GITHUB_WORKSPACE/perf_samples"
+            find "$LOGS_DIR" -name 'perf_samples_*.csv' -exec cp {} "$GITHUB_WORKSPACE/perf_samples/" \;
+            perf_csv_count=$(ls "$GITHUB_WORKSPACE/perf_samples"/perf_samples_*.csv 2>/dev/null | wc -l | tr -d ' ')
+            echo "GPU_METRICS_CSV_GLOB=$GITHUB_WORKSPACE/perf_samples/perf_samples_*.csv" >> "$GITHUB_ENV"
+            echo "[perfmon] staged $perf_csv_count per-node perf_samples_*.csv to \$GITHUB_WORKSPACE/perf_samples/"
+        else
+            echo "[perfmon] WARNING: monitoring enabled but no perf_samples_*.csv under $LOGS_DIR — measured power aggregation skipped (compute-node visibility?)" >&2
+        fi
+    fi
+
     # Clean up srt-slurm outputs to prevent NFS silly-rename lock files
     # from blocking the next job's checkout on this runner
     echo "Cleaning up srt-slurm outputs..."

diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh
@@ -283,6 +283,20 @@ elif [[ $FRAMEWORK == "dynamo-trt" && $MODEL_PREFIX == "kimik2.5" ]]; then
     git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
     git checkout sa-submission-q2-2026
+elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "dsr1" ]]; then
+    # MEASURED-POWER CAMPAIGN (dsr1-disagg-NVIDIA): clone the SemiAnalysisAI
+    # perfmon fork instead of cquil11/srt-slurm-nv. The fork ships the SAME
+    # gb200 dsr1 recipes (recipes/gb200-fp4|fp8/<seq>/*.yaml, model.path
+    # dsr1-fp4 / served deepseek-ai/DeepSeek-R1 — the exact CONFIG_FILE the
+    # dsr1-fp?-gb200-dynamo-sglang configs reference) PLUS the per-node
+    # nvidia-smi perfmon machinery (src/srtctl/monitor/perfmon.py) that writes
+    # perf_samples_*.csv when a recipe declares `monitoring:`. Pointing dsr1 here
+    # is what turns the gb200 dsr1 energy charts MEASURED. Minor recipe/srtctl
+    # drift vs cquil11's validated source is acceptable for a power/temp measure.
+    git clone https://github.com/SemiAnalysisAI/srt-slurm.git "$SRT_REPO_DIR"
+    cd "$SRT_REPO_DIR"
+    git checkout feat/inferencex-perfmon || exit 1  # the perfmon branch is mandatory: never continue on the fork's default branch
+    export PERFMON_ENABLED=1
 else
     git clone --branch cam/sa-submission-q2-2026 --single-branch https://github.com/cquil11/srt-slurm-nv.git "$SRT_REPO_DIR"
     cd "$SRT_REPO_DIR"
@@ -385,6 +399,23 @@ if [[ ! -f "$CONFIG_PATH" ]]; then
 fi
 sed -i "s/^name:.*/name: \"${RUNNER_NAME}\"/" "$CONFIG_PATH"
 
+# MEASURED-POWER CAMPAIGN: enable per-node nvidia-smi perfmon and give the slow
+# dsr1 bring-up headroom. `monitoring:` makes the orchestrator spawn perfmon.py
+# (-> perf_samples_*.csv); `health_check` raises the default 1800s server-ready
+# ceiling (dsr1 warmup = load 671B weights + FlashInfer autotune + CUDA-graph
+# capture ~= 35min > 30min default, which otherwise times out and kills etcd).
+# Idempotent; only the perfmon-fork dsr1 path sets PERFMON_ENABLED.
+if [ "${PERFMON_ENABLED:-0}" = "1" ] && [ -f "$CONFIG_PATH" ]; then
+    if ! grep -q '^monitoring:' "$CONFIG_PATH"; then
+        printf '\nmonitoring:\n  enabled: true\n  sample_interval: 1.0\n' >> "$CONFIG_PATH"
+        echo "[perfmon] injected monitoring: into $CONFIG_PATH"
+    fi
+    if ! grep -q '^health_check:' "$CONFIG_PATH"; then
+        printf '\nhealth_check:\n  max_attempts: 540\n  interval_seconds: 10\n' >> "$CONFIG_PATH"
+        echo "[perfmon] injected health_check (90min ceiling) into $CONFIG_PATH"
+    fi
+fi
+
 if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then
     SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_PATH" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" --setup-script install-torchao.sh 2>&1)
 else
@@ -447,6 +478,27 @@ else
     echo "Warning: Logs directory not found at $LOGS_DIR"
 fi
 
+# MEASURED-POWER CAMPAIGN: stage per-node perfmon CSVs for the downstream
+# "Process result" step. perfmon writes perf_samples_*.csv under the job logs
+# dir; copy them into $GITHUB_WORKSPACE and export GPU_METRICS_CSV_GLOB so
+# process_result.py runs aggregate_power.py and patches the agg JSON with
+# measured power + temp/util/mem. Guarded on PERFMON_ENABLED (dsr1 path only).
+# NOTE: if perf_samples are missing on this cluster, the per-node CSVs (written
+# by perfmon on COMPUTE nodes) may not be visible in the head-node LOGS_DIR —
+# the Oracle/watchtower gb200 cluster does not cross-mount /home/slurm-shared
+# (see the minimax shared-FS handling above); dsr1 may need the same treatment.
+if [ "${PERFMON_ENABLED:-0}" = "1" ] && [ -d "$LOGS_DIR" ]; then
+    if find "$LOGS_DIR" -name 'perf_samples_*.csv' 2>/dev/null | grep -q .; then
+        mkdir -p "$GITHUB_WORKSPACE/perf_samples"
+        find "$LOGS_DIR" -name 'perf_samples_*.csv' -exec cp {} "$GITHUB_WORKSPACE/perf_samples/" \;
+        perf_csv_count=$(ls "$GITHUB_WORKSPACE/perf_samples"/perf_samples_*.csv 2>/dev/null | wc -l | tr -d ' ')
+        echo "GPU_METRICS_CSV_GLOB=$GITHUB_WORKSPACE/perf_samples/perf_samples_*.csv" >> "$GITHUB_ENV"
+        echo "[perfmon] staged $perf_csv_count per-node perf_samples_*.csv to \$GITHUB_WORKSPACE/perf_samples/"
+    else
+        echo "[perfmon] WARNING: monitoring enabled but no perf_samples_*.csv under $LOGS_DIR — measured power aggregation skipped (compute-node visibility? see note above)" >&2
+    fi
+fi
+
 if [[ "${EVAL_ONLY:-false}" != "true" ]]; then
     if [ ! -d "$LOGS_DIR" ]; then
         exit 1