# CollectiveX Experimental #41
name: CollectiveX Experimental
# This workflow only orchestrates; all benchmark logic lives under
# experimental/CollectiveX/.
#   * push to the feature branch -> MI355X MoRI dispatch/combine benchmark
#     (no merge to main needed).
#   * workflow_dispatch          -> a chosen SKU + benchmark; this is the lane
#     for GB200/B200 NCCL, DeepEP, and larger sweeps.
# Every job lands on the SKU's self-hosted runner and invokes that SKU's launch
# script, following the same launch_${RUNNER_NAME%%_*}.sh convention the
# serving benchmarks use.
on:
  push:
    branches:
      - collectivex
    paths:
      - 'experimental/CollectiveX/**'
      - '.github/workflows/collectivex-experimental.yml'
  workflow_dispatch:
    inputs:
      sku:
        # The prefix of runner.name selects the launch script, so an SKU with
        # no matching launchers/launch_<prefix>.sh would fail; only SKUs that
        # have one are offered here.
        description: Self-hosted runner pool (must have a CollectiveX launcher)
        type: choice
        options:
          - gb200
          - b200-dgxc
          - b200-multinode
          - mi355x
          - h100-dgxc
          - b300
        default: gb200
      benchmark:
        # mori is for mi355x only; nccl / deepep / all are for the NVIDIA SKUs.
        description: Which benchmark to run
        type: choice
        options:
          - nccl
          - deepep
          - mori
          - all
        default: nccl
      ops:
        description: NCCL ops (space-separated); blank = default set
        type: string
        default: ''
      min_bytes:
        description: nccl-tests min message size
        type: string
        default: '8'
      max_bytes:
        description: nccl-tests max message size
        type: string
        default: '8G'
      ngpus:
        description: GPUs per node (blank = SKU default)
        type: string
        default: ''
      nodes:
        description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node.
        type: string
        default: ''
      phase:
        # Applies to EP benchmarks only. Choosing 'both' produces one job per
        # phase (decode + prefill).
        description: EP phase — decode (small T) / prefill (large T); 'both' = a job each
        type: choice
        options:
          - both
          - decode
          - prefill
        default: both
      tokens_ladder:
        description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default
        type: string
        default: ''
      dispatch_dtype:
        description: EP dispatch payload precision
        type: choice
        options:
          - bf16
          - fp8
        default: bf16
      mode:
        # normal -> high-throughput kernels (decode + prefill).
        # ll     -> DeepEP low-latency path (decode-shaped, fp8 cast in-kernel).
        # LL is rejected on backends without it (MoRI) and aborts on fabrics
        # that lack it (B300), so select it only where it is supported.
        description: EP kernel path — normal or low-latency (LL)
        type: choice
        options:
          - normal
          - ll
        default: normal
      resource_mode:
        # normalized -> ~sm_fraction of device units (cross-vendor
        #               apples-to-apples comparison).
        # tuned      -> each backend's own recommended/default launch config.
        # NOTE(review): 'default' is also selectable but is not described
        # here — confirm how the launchers treat it relative to 'tuned'.
        description: Comm resource regime
        type: choice
        options:
          - normalized
          - tuned
          - default
        default: normalized
concurrency:
  # The dispatch SKU is part of the group key, so workflow_dispatch runs that
  # target different SKUs never cancel one another. A push carries no sku
  # input and therefore falls back to the single shared 'push' group.
  group: "collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'push' }}"
  cancel-in-progress: true
permissions:
  contents: read
jobs:
  # Push -> MI355X MoRI dispatch/combine. Lands on a free mi355x-amds runner and
  # runs launch_mi355x-amds.sh (CX_BENCH=mori). The AMD workspace is compute-
  # visible, so no CX_STAGE_DIR; the launcher defaults to 8 GPUs.
  experimental:
    name: CollectiveX Experimental (${{ matrix.phase }})
    if: github.event_name == 'push'
    runs-on: mi355x
    timeout-minutes: 90
    strategy:
      fail-fast: false
      matrix:
        # Push = a fast MoRI SMOKE only (decode). The full sweep is
        # workflow_dispatch.
        phase: [decode]
    env:
      CX_BENCH: mori
      CX_PHASE: ${{ matrix.phase }}
      # SMOKE ladder capped at T<=16: MoRI + realistic (fan-out≈5.3) routing
      # currently WEDGES at T>=32 (under investigation; DeepEP is fine), and an
      # unguarded run hung ~1 h before the job timeout. Keep the push smoke in
      # the known-good range; run the full sweep via workflow_dispatch
      # (timeout-guarded). Remove the cap once fixed.
      CX_TOKENS_LADDER: "1 2 4 8 16"
      CX_RUN_TIMEOUT: "600"
      # Pin to the MI355X nodes that hold the node-local squash and have a
      # writable /var/lib/squash; other nodes need a slow cold import that can
      # fail on lock/cache permissions. Widen once the squash is staged
      # cluster-wide.
      CX_NODELIST: mia1-p01-g10,mia1-p01-g15
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v5.0.0
        with:
          clean: true
      - name: Launch MI355X MoRI (${{ matrix.phase }})
        env:
          RUNNER_NAME: ${{ runner.name }}
        # ${RUNNER_NAME%%_*} drops everything from the first '_' onward, so the
        # remaining runner-name prefix selects the launch script.
        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
      - name: Results summary
        # always(): still publish whatever results exist when the launch failed.
        if: always()
        run: |
          python3 experimental/CollectiveX/summarize.py \
            --results-dir experimental/CollectiveX/results \
            --markdown >> "$GITHUB_STEP_SUMMARY"
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
        with:
          name: collectivex_mi355x_mori_${{ matrix.phase }}_${{ github.run_id }}
          path: experimental/CollectiveX/results/*.json
          if-no-files-found: warn

  # Manual dispatch -> chosen SKU + benchmark. Lands on the inputs.sku runner.
  dispatch:
    if: github.event_name == 'workflow_dispatch'
    runs-on: ${{ inputs.sku }}
    timeout-minutes: 120
    strategy:
      fail-fast: false
      matrix:
        # Phase only affects EP (deepep/mori). 'both' fans out to one job per
        # phase (decode + prefill) — except when benchmark is 'nccl', which
        # ignores the phase: there 'both' collapses to a single job (labelled
        # 'decode') so the identical NCCL run does not occupy the runner twice.
        # An explicit decode/prefill choice always yields exactly one job.
        phase: ${{ fromJSON((inputs.phase == 'both' && inputs.benchmark != 'nccl') && '["decode","prefill"]' || format('["{0}"]', inputs.phase == 'both' && 'decode' || inputs.phase)) }}
    env:
      CX_BENCH: ${{ inputs.benchmark }}
      CX_OPS: ${{ inputs.ops }}
      CX_MIN_BYTES: ${{ inputs.min_bytes }}
      CX_MAX_BYTES: ${{ inputs.max_bytes }}
      CX_NGPUS: ${{ inputs.ngpus }}
      CX_NODES: ${{ inputs.nodes }}
      CX_PHASE: ${{ matrix.phase }}
      CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }}
      CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }}
      CX_MODE: ${{ inputs.mode }}
      CX_RESOURCE_MODE: ${{ inputs.resource_mode }}
      # GB200/watchtower needs a compute-visible workspace; harmless elsewhere.
      CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
      # MI355X: pin to the warm-squash, writable nodes (same list as the push
      # job's CX_NODELIST).
      CX_NODELIST: ${{ inputs.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
      # NOTE(review): unlike the push job, no CX_RUN_TIMEOUT is set here.
      # Confirm the launcher applies its own per-run guard; otherwise a wedged
      # run is bounded only by timeout-minutes above.
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v5.0.0
        with:
          clean: true
      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }})
        env:
          RUNNER_NAME: ${{ runner.name }}
        # ${RUNNER_NAME%%_*} drops everything from the first '_' onward, so the
        # remaining runner-name prefix selects the launch script.
        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
      - name: Results summary
        # always(): still publish whatever results exist when the launch failed.
        if: always()
        run: |
          python3 experimental/CollectiveX/summarize.py \
            --results-dir experimental/CollectiveX/results \
            --markdown >> "$GITHUB_STEP_SUMMARY"
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
        with:
          name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
          path: experimental/CollectiveX/results/*.json
          if-no-files-found: warn