CollectiveX Experimental #41
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: CollectiveX Experimental

# This workflow only orchestrates; every piece of benchmark logic lives under
# experimental/CollectiveX/.
#   * A push to the feature branch runs the MI355X MoRI dispatch/combine
#     benchmark (no merge to main is needed).
#   * workflow_dispatch runs a chosen SKU + benchmark; it is the lane for
#     GB200/B200 NCCL, DeepEP, and larger sweeps.
# Each job lands on the SKU's self-hosted runner and invokes that SKU's launch
# script, following the same launch_${RUNNER_NAME%%_*}.sh convention that the
# serving benchmarks use.
on:
  # Pushes to the feature branch that touch the benchmark tree or this workflow.
  push:
    branches:
      - collectivex
    paths:
      - 'experimental/CollectiveX/**'
      - '.github/workflows/collectivex-experimental.yml'

  # Manual runs: choose the runner pool, the benchmark, and the sweep parameters.
  workflow_dispatch:
    inputs:
      # The prefix of runner.name selects launchers/launch_<prefix>.sh, so only
      # SKUs that have a matching launcher are offered; an SKU without one fails.
      sku:
        description: Self-hosted runner pool (must have a CollectiveX launcher)
        type: choice
        default: gb200
        options:
          - gb200
          - b200-dgxc
          - b200-multinode
          - mi355x
          - h100-dgxc
          - b300

      # mori runs only on mi355x; nccl, deepep, and all run on the NVIDIA SKUs.
      benchmark:
        description: Which benchmark to run
        type: choice
        default: nccl
        options:
          - nccl
          - deepep
          - mori
          - all

      ops:
        description: NCCL ops (space-separated); blank = default set
        type: string
        default: ''

      min_bytes:
        description: nccl-tests min message size
        type: string
        default: '8'

      max_bytes:
        description: nccl-tests max message size
        type: string
        default: '8G'

      ngpus:
        description: GPUs per node (blank = SKU default)
        type: string
        default: ''

      nodes:
        description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node.
        type: string
        default: ''

      # EP benchmarks only. 'both' fans out into one job per phase
      # (decode + prefill).
      phase:
        description: EP phase — decode (small T) / prefill (large T); 'both' = a job each
        type: choice
        default: both
        options:
          - both
          - decode
          - prefill

      tokens_ladder:
        description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default
        type: string
        default: ''

      dispatch_dtype:
        description: EP dispatch payload precision
        type: choice
        default: bf16
        options:
          - bf16
          - fp8

      # normal = the high-throughput kernels (decode + prefill).
      # ll     = DeepEP low-latency (decode-shaped, fp8 cast in-kernel).
      # LL is rejected on backends that do not implement it (MoRI) and aborts on
      # fabrics that lack it (B300), so select it only where it is supported.
      mode:
        description: EP kernel path — normal or low-latency (LL)
        type: choice
        default: normal
        options:
          - normal
          - ll

      # normalized = roughly sm_fraction of the device units, for a cross-vendor
      #              apples-to-apples comparison.
      # tuned      = each backend's own recommended/default launch config.
      # NOTE(review): 'default' is also offered but is not described here; its
      # meaning is whatever the launcher assigns to it — confirm and document.
      resource_mode:
        description: Comm resource regime
        type: choice
        default: normalized
        options:
          - normalized
          - tuned
          - default
# One in-flight run per ref + event + SKU. The dispatch SKU is part of the group
# key so that two workflow_dispatch runs on different SKUs do not cancel each
# other; a push has no sku input, so all pushes share a single 'push' group.
concurrency:
  group: collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'push' }}
  cancel-in-progress: true

# The only permission requested for the workflow token is read access to the
# repository contents.
permissions:
  contents: read
jobs:
  # Push lane -> MI355X MoRI dispatch/combine. The job lands on a free
  # mi355x-amds runner and runs launch_mi355x-amds.sh with CX_BENCH=mori. The AMD
  # workspace is compute-visible, so CX_STAGE_DIR is not set; the launcher
  # defaults to 8 GPUs.
  experimental:
    name: CollectiveX Experimental (${{ matrix.phase }})
    if: github.event_name == 'push'
    runs-on: mi355x
    timeout-minutes: 90
    strategy:
      fail-fast: false
      matrix:
        # A push is only a fast MoRI smoke test (decode). The full sweep is run
        # through workflow_dispatch.
        phase:
          - decode
    env:
      CX_BENCH: mori
      CX_PHASE: ${{ matrix.phase }}
      # Smoke ladder capped at T<=16: MoRI with realistic (fan-out≈5.3) routing
      # currently wedges at T>=32 (under investigation; DeepEP is fine), and an
      # unguarded run hung for about an hour before the job timeout. Keep the
      # push smoke inside the known-good range and run the full sweep through
      # workflow_dispatch (timeout-guarded). Remove the cap once this is fixed.
      CX_TOKENS_LADDER: "1 2 4 8 16"
      CX_RUN_TIMEOUT: "600"
      # Pin to the MI355X nodes that hold the node-local squash and have a
      # writable /var/lib/squash; other nodes need a slow cold import that can
      # fail on lock/cache permissions. Widen once the squash is staged
      # cluster-wide.
      CX_NODELIST: mia1-p01-g10,mia1-p01-g15
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v5.0.0
        with:
          clean: true

      # The runner name is passed through env (not interpolated into the script)
      # and its prefix up to the first '_' picks the launcher.
      - name: Launch MI355X MoRI (${{ matrix.phase }})
        env:
          RUNNER_NAME: ${{ runner.name }}
        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"

      # Summary and upload run even when the launch step failed, so partial
      # results are still published.
      - name: Results summary
        if: always()
        run: >-
          python3 experimental/CollectiveX/summarize.py
          --results-dir experimental/CollectiveX/results
          --markdown >> "$GITHUB_STEP_SUMMARY"

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
        with:
          name: collectivex_mi355x_mori_${{ matrix.phase }}_${{ github.run_id }}
          path: experimental/CollectiveX/results/*.json
          if-no-files-found: warn

  # Manual dispatch -> the chosen SKU + benchmark. The job lands on the runner
  # pool named by inputs.sku.
  dispatch:
    if: github.event_name == 'workflow_dispatch'
    runs-on: ${{ inputs.sku }}
    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
        # The phase only affects the EP benchmarks (deepep/mori):
        #   * a concrete phase        -> a single job for that phase;
        #   * 'both' (EP or 'all')    -> one job per phase (decode + prefill);
        #   * 'both' + benchmark=nccl -> a single job. NCCL ignores the phase, so
        #     fanning out would only run the identical sweep twice on the same
        #     runner pool; CX_PHASE=decode is still passed but is not used.
        phase: ${{ fromJSON(inputs.phase != 'both' && format('["{0}"]', inputs.phase) || inputs.benchmark == 'nccl' && '["decode"]' || '["decode","prefill"]') }}
    env:
      CX_BENCH: ${{ inputs.benchmark }}
      CX_OPS: ${{ inputs.ops }}
      CX_MIN_BYTES: ${{ inputs.min_bytes }}
      CX_MAX_BYTES: ${{ inputs.max_bytes }}
      CX_NGPUS: ${{ inputs.ngpus }}
      CX_NODES: ${{ inputs.nodes }}
      CX_PHASE: ${{ matrix.phase }}
      CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }}
      CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }}
      CX_MODE: ${{ inputs.mode }}
      CX_RESOURCE_MODE: ${{ inputs.resource_mode }}
      # GB200/watchtower needs a compute-visible workspace; the variable is empty
      # (harmless) for every other SKU.
      CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
      # MI355X: pin to the same warm-squash, writable nodes as the push job;
      # empty for every other SKU.
      CX_NODELIST: ${{ inputs.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v5.0.0
        with:
          clean: true

      # The runner name is passed through env (not interpolated into the script)
      # and its prefix up to the first '_' picks the launcher.
      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }})
        env:
          RUNNER_NAME: ${{ runner.name }}
        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"

      # Summary and upload run even when the launch step failed, so partial
      # results are still published.
      - name: Results summary
        if: always()
        run: >-
          python3 experimental/CollectiveX/summarize.py
          --results-dir experimental/CollectiveX/results
          --markdown >> "$GITHUB_STEP_SUMMARY"

      - name: Upload results
        if: always()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
        with:
          name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
          path: experimental/CollectiveX/results/*.json
          if-no-files-found: warn