CollectiveX Experimental #41

Workflow file for this run

.github/workflows/collectivex-experimental.yml at 9f85d05

	name: CollectiveX Experimental

	# Orchestration only — all benchmark logic lives in experimental/CollectiveX/.
	# Push to the feature branch runs the MI355X MoRI dispatch/combine benchmark (no
	# merge to main needed); workflow_dispatch runs a chosen SKU + benchmark (the lane
	# for GB200/B200 NCCL, DeepEP, and larger sweeps). Each job lands on the SKU's
	# self-hosted runner and invokes that SKU's launch script — the same
	# launch_${RUNNER_NAME%%_*}.sh convention the serving benchmarks use.

	on:
	  # Automatic trigger: only the 'collectivex' branch, and only when the
	  # benchmark tree or this workflow file changes.
	  push:
	    branches:
	      - collectivex
	    paths:
	      - 'experimental/CollectiveX/**'
	      - '.github/workflows/collectivex-experimental.yml'
	  # Manual trigger: every input below is forwarded to the launcher as a CX_*
	  # environment variable by the dispatch job.
	  workflow_dispatch:
	    inputs:
	      sku:
	        # Only SKUs with a matching launchers/launch_<prefix>.sh are offered —
	        # runner.name's prefix selects the script, so an SKU without one fails.
	        description: Self-hosted runner pool (must have a CollectiveX launcher)
	        type: choice
	        default: gb200
	        options: [gb200, b200-dgxc, b200-multinode, mi355x, h100-dgxc, b300]
	      benchmark:
	        # mori runs only on mi355x; nccl/deepep/all on the NVIDIA SKUs.
	        description: Which benchmark to run
	        type: choice
	        default: nccl
	        options: [nccl, deepep, mori, all]
	      ops:
	        description: NCCL ops (space-separated); blank = default set
	        type: string
	        default: ''
	      min_bytes:
	        description: nccl-tests min message size
	        type: string
	        default: '8'
	      max_bytes:
	        description: nccl-tests max message size
	        type: string
	        default: '8G'
	      ngpus:
	        description: GPUs per node (blank = SKU default)
	        type: string
	        default: ''
	      nodes:
	        description: Node count (gb200 multi-node MNNVL; 2 = 8 GPU). Blank/1 = single node.
	        type: string
	        default: ''
	      phase:
	        # EP only. 'both' fans out to one job per phase (decode + prefill).
	        description: EP phase — decode (small T) / prefill (large T); 'both' = a job each
	        type: choice
	        default: both
	        options: [both, decode, prefill]
	      tokens_ladder:
	        description: EP source-tokens-per-rank sweep (space/comma sep); blank = phase default
	        type: string
	        default: ''
	      dispatch_dtype:
	        description: EP dispatch payload precision
	        type: choice
	        default: bf16
	        options: [bf16, fp8]
	      mode:
	        # normal = high-throughput kernels (decode+prefill); ll = DeepEP low-latency
	        # (decode-shaped, fp8 cast in-kernel). LL is rejected on backends without it
	        # (MoRI) and aborts on fabrics that lack it (B300) — run only where supported.
	        description: EP kernel path — normal or low-latency (LL)
	        type: choice
	        default: normal
	        options: [normal, ll]
	      resource_mode:
	        # normalized = ~sm_fraction of device units (cross-vendor apples-to-apples);
	        # tuned = each backend's own recommended/default launch config.
	        # NOTE(review): 'default' is also offered below but is not described
	        # here — confirm how it differs from 'tuned'.
	        description: Comm resource regime
	        type: choice
	        default: normalized
	        options: [normalized, tuned, default]

	# One concurrency group per (ref, event, SKU); a newer run in the same group
	# cancels the one still in progress.
	concurrency:
	  # Include the dispatch SKU so two workflow_dispatch runs on different SKUs do
	  # not cancel each other; push has no sku input -> shares one 'push' group.
	  group: collectivex-${{ github.ref }}-${{ github.event_name }}-${{ inputs.sku || 'push' }}
	  cancel-in-progress: true

	# Read-only token: the jobs in this workflow check out the repository and
	# upload artifacts; nothing here writes back to the repo.
	permissions:
	  contents: read

	jobs:
	  # Push -> MI355X MoRI dispatch/combine. Lands on a free mi355x-amds runner and
	  # runs launch_mi355x-amds.sh (CX_BENCH=mori). The AMD workspace is compute-
	  # visible, so no CX_STAGE_DIR; the launcher defaults to 8 GPUs.
	  experimental:
	    name: CollectiveX Experimental (${{ matrix.phase }})
	    if: github.event_name == 'push'
	    runs-on: mi355x
	    timeout-minutes: 90
	    strategy:
	      fail-fast: false
	      matrix:
	        # Push = a fast MoRI SMOKE only (decode). The full sweep is workflow_dispatch.
	        phase: [decode]
	    env:
	      CX_BENCH: mori
	      CX_PHASE: ${{ matrix.phase }}
	      # SMOKE ladder capped at T<=16: MoRI + realistic (fan-out≈5.3) routing currently
	      # WEDGES at T>=32 (under investigation; DeepEP is fine), and an unguarded run hung
	      # ~1 h before the job timeout. Keep the push smoke in the known-good range; run the
	      # full sweep via workflow_dispatch (timeout-guarded). Remove the cap once fixed.
	      CX_TOKENS_LADDER: "1 2 4 8 16"
	      CX_RUN_TIMEOUT: "600"
	      # Pin to the MI355X nodes that hold the node-local squash and have a writable
	      # /var/lib/squash; other nodes need a slow cold import that can fail on lock/
	      # cache permissions. Widen once the squash is staged cluster-wide.
	      CX_NODELIST: mia1-p01-g10,mia1-p01-g15
	    steps:
	      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
	        with: { clean: true }
	      - name: Launch MI355X MoRI (${{ matrix.phase }})
	        env:
	          RUNNER_NAME: ${{ runner.name }}
	        # ${RUNNER_NAME%%_*} keeps the runner name up to its first underscore;
	        # that prefix selects the launcher script.
	        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
	      # always(): summarize and upload whatever was produced even when the
	      # launch step failed or timed out.
	      - name: Results summary
	        if: always()
	        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
	      - name: Upload results
	        if: always()
	        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	        with:
	          name: collectivex_mi355x_mori_${{ matrix.phase }}_${{ github.run_id }}
	          path: experimental/CollectiveX/results/*.json
	          if-no-files-found: warn

	  # Manual dispatch -> chosen SKU + benchmark. Lands on the inputs.sku runner.
	  dispatch:
	    if: github.event_name == 'workflow_dispatch'
	    runs-on: ${{ inputs.sku }}
	    timeout-minutes: 120
	    strategy:
	      fail-fast: false
	      matrix:
	        # 'both' -> one job per phase (decode + prefill); else a single job. Phase
	        # only affects EP (deepep/mori); nccl ignores it (runs the same twice).
	        phase: ${{ fromJSON(inputs.phase == 'both' && '["decode","prefill"]' || format('["{0}"]', inputs.phase)) }}
	    env:
	      # Dispatch inputs are handed to the launcher as CX_* variables.
	      CX_BENCH: ${{ inputs.benchmark }}
	      CX_OPS: ${{ inputs.ops }}
	      CX_MIN_BYTES: ${{ inputs.min_bytes }}
	      CX_MAX_BYTES: ${{ inputs.max_bytes }}
	      CX_NGPUS: ${{ inputs.ngpus }}
	      CX_NODES: ${{ inputs.nodes }}
	      CX_PHASE: ${{ matrix.phase }}
	      CX_TOKENS_LADDER: ${{ inputs.tokens_ladder }}
	      CX_DISPATCH_DTYPE: ${{ inputs.dispatch_dtype }}
	      CX_MODE: ${{ inputs.mode }}
	      CX_RESOURCE_MODE: ${{ inputs.resource_mode }}
	      # GB200/watchtower needs a compute-visible workspace; harmless elsewhere.
	      CX_STAGE_DIR: ${{ inputs.sku == 'gb200' && '/mnt/lustre01/users-public/sa-shared/cx-stage' || '' }}
	      # MI355X: pin to the warm-squash, writable nodes (see the push job).
	      CX_NODELIST: ${{ inputs.sku == 'mi355x' && 'mia1-p01-g10,mia1-p01-g15' || '' }}
	    steps:
	      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v5.0.0
	        with: { clean: true }
	      - name: Launch ${{ inputs.sku }} / ${{ inputs.benchmark }} (${{ matrix.phase }})
	        env:
	          RUNNER_NAME: ${{ runner.name }}
	        # ${RUNNER_NAME%%_*} keeps the runner name up to its first underscore;
	        # that prefix selects the launcher script.
	        run: bash "experimental/CollectiveX/launchers/launch_${RUNNER_NAME%%_*}.sh"
	      # always(): summarize and upload whatever was produced even when the
	      # launch step failed or timed out.
	      - name: Results summary
	        if: always()
	        run: python3 experimental/CollectiveX/summarize.py --results-dir experimental/CollectiveX/results --markdown >> "$GITHUB_STEP_SUMMARY"
	      - name: Upload results
	        if: always()
	        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
	        with:
	          name: collectivex_${{ inputs.sku }}_${{ inputs.benchmark }}_${{ matrix.phase }}_${{ github.run_id }}
	          path: experimental/CollectiveX/results/*.json
	          if-no-files-found: warn

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

CollectiveX Experimental #41

Workflow file

CollectiveX Experimental #41

Uh oh!

Workflow file for this run