Skip to content

Feature: add umbp benchmark / distributed mode flush API / hot-path o… #1281

Feature: add umbp benchmark / distributed mode flush API / hot-path o…

Feature: add umbp benchmark / distributed mode flush API / hot-path o… #1281

Workflow file for this run

# MoRI continuous-integration workflow: triggers, manual-run inputs,
# run de-duplication, and the environment shared by every job.
name: MoRI CI test

on:
  # Automatic runs for pushes to, and pull requests against, main.
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  # Manual runs; the inputs below drive the optional hicache smoke test.
  workflow_dispatch:
    inputs:
      run_umbp_single_node_hicache:
        description: Run UMBP single-node hicache smoke test
        type: boolean
        default: false
      sglang_repo:
        description: owner/repo to checkout for SGLang
        type: string
        # NOTE(review): upstream SGLang appears to be hosted at
        # sgl-project/sglang - confirm this default is the intended repository.
        default: "sgl-team/sglang"
      sglang_ref:
        description: branch, tag, or SHA for SGLang checkout
        type: string
        default: "main"
      enable_dp:
        description: Enable DP/EP modes when launching the hicache test
        type: boolean
        default: false

# One live run per pull request (or per ref when there is no PR number);
# starting a newer run cancels the one still in progress.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}

# Defaults visible to every job; the internode job overrides CT per matrix entry.
env:
  # Container runtime CLI invoked as $CT in the steps.
  CT: docker
  # Base image handed to the Dockerfile.dev build as BASE_IMAGE.
  BASE_IMAGE: rocm/pytorch:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.8.0
  # Tag given to the CI image built in each job.
  IMAGE: rocm/mori:ci
  # Per-run container name, so leftovers from other runs are not reused.
  CONTAINER: mori_ci_${{ github.run_id }}
jobs:
# Builds the CI image on each platform runner and checks that mori installs
# (with and without examples) and imports. All other jobs depend on this one.
build:
  name: mori build (${{ matrix.platform }})
  runs-on: ${{ matrix.runner }}
  timeout-minutes: 15
  strategy:
    # Let the other platform finish even if one fails.
    fail-fast: false
    matrix:
      include:
        - platform: MI355X_AINIC
          runner:
            - self-hosted
            - MI355X-AINIC-TW
        - platform: MI300X_BNXT
          runner:
            - self-hosted
            - MI300X-BNXT
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: true

    - name: Build CI image
      run: $CT build --network=host --build-arg BASE_IMAGE=$BASE_IMAGE -t $IMAGE -f docker/Dockerfile.dev .

    # Drop any stale container with the same name, then start a long-lived
    # one with the workspace mounted at the same path.
    - name: Start container
      run: |
        $CT rm -f $CONTAINER 2>/dev/null || true
        CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name $CONTAINER \
          -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
          -w $GITHUB_WORKSPACE \
          $IMAGE sleep infinity
        $CT exec $CONTAINER \
          git config --global --add safe.directory $GITHUB_WORKSPACE

    - name: Build and install mori
      run: $CT exec $CONTAINER pip install .

    - name: Build and install mori (with examples)
      run: $CT exec $CONTAINER bash -c "BUILD_EXAMPLES=ON pip install ."

    - name: Verify installation
      run: $CT exec $CONTAINER python -c "import mori; print('mori ' + mori.__version__ + ' OK')"

    # Always remove the container and hand workspace ownership back to the
    # runner user; both actions are best-effort.
    - name: Cleanup
      if: always()
      run: |
        $CT rm -f $CONTAINER || true
        $CT run --rm -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE $BASE_IMAGE \
          chown -R $(id -u):$(id -g) $GITHUB_WORKSPACE 2>/dev/null || true
# Single-node test and benchmark suite, run inside the CI container on each
# platform runner.
#
# Multi-command test scripts passed to `bash -c` start with `set -e` so that
# the first failing command fails the step; without it only the last command's
# exit status would reach the runner and earlier test failures would be hidden.
intranode-test:
  name: mori intranode test (${{ matrix.platform }})
  needs: build
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      include:
        - platform: MI355X_AINIC
          runner: [self-hosted, MI355X-AINIC-TW]
          rdma_devices: rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
          rdma_sl: 3
          rdma_tc: 104
        - platform: MI300X_BNXT
          runner: [self-hosted, MI300X-BNXT]
          rdma_devices: bnxt_re0,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8,bnxt_re9
          rdma_sl: 3
          rdma_tc: 104
          socket_ifname: enp159s0np0
  env:
    MORI_RDMA_DEVICES: ${{ matrix.rdma_devices }}
    MORI_RDMA_SL: ${{ matrix.rdma_sl }}
    MORI_RDMA_TC: ${{ matrix.rdma_tc }}
    # Empty for matrix entries that do not define socket_ifname.
    MORI_SOCKET_IFNAME: ${{ matrix.socket_ifname }}
  timeout-minutes: 45
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: true

    - name: Build CI image
      run: $CT build --network=host --build-arg BASE_IMAGE=$BASE_IMAGE -t $IMAGE -f docker/Dockerfile.dev .

    # MORI_SOCKET_IFNAME is forwarded only when it is non-empty (":+" expansion).
    - name: Start container
      run: |
        $CT rm -f $CONTAINER 2>/dev/null || true
        CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name $CONTAINER \
          -e MORI_RDMA_DEVICES=$MORI_RDMA_DEVICES \
          -e MORI_RDMA_SL=$MORI_RDMA_SL \
          -e MORI_RDMA_TC=$MORI_RDMA_TC \
          ${MORI_SOCKET_IFNAME:+-e MORI_SOCKET_IFNAME=$MORI_SOCKET_IFNAME} \
          -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
          -w $GITHUB_WORKSPACE \
          $IMAGE sleep infinity
        $CT exec $CONTAINER \
          git config --global --add safe.directory $GITHUB_WORKSPACE

    - name: Install mori
      run: |
        $CT exec $CONTAINER bash -c \
          "cd $GITHUB_WORKSPACE && BUILD_BENCHMARK=ON BUILD_EXAMPLES=ON pip install . && pip install prettytable"

    - name: MORI-EP (intranode)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE && timeout 360 pytest tests/python/ops/test_dispatch_combine_intranode.py -v
        "

    - name: MORI-EP (internode_v1)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE && timeout 300 pytest tests/python/ops/test_dispatch_combine_internode_v1.py -v
        "

    - name: MORI-EP (routing handle)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE && timeout 300 pytest tests/python/ops/test_dispatch_combine_routing_handle.py -v
        "

    - name: MORI-EP (async_ll SDMA)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE && MORI_ENABLE_SDMA=1 timeout 300 pytest tests/python/ops/test_dispatch_combine_async_ll.py -v
        "

    - name: MORI-EP (async_ll IBGDA)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE && MORI_DISABLE_P2P=1 MORI_ENABLE_SDMA=0 timeout 300 pytest tests/python/ops/test_dispatch_combine_async_ll.py -v
        "

    # NOTE(review): no `set -e` here, so only the second invocation's exit
    # status gates this step - confirm whether the first should gate it too.
    - name: MORI-EP bench
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE
          timeout 120 python3 tests/python/ops/bench_dispatch_combine.py
          timeout 120 python3 tests/python/ops/bench_dispatch_combine.py \
            --cmd bench --dtype bf16 --quant-type fp8_blockwise \
            --zero-copy 0 --max-tokens 128 \
            --force-scale-active 1 --report-scale-stats 1
        "

    # Two pytest invocations: `set -e` makes a failure of the first one fail
    # the step instead of being overwritten by the second one's status.
    - name: MORI-IO
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          timeout 60 pytest tests/python/io/ -v
          MORI_IO_XGMI_SCATTER_GATHER_THRESHOLD=4 \
            timeout 120 pytest tests/python/io/test_discrete_buffer.py -v -k 'not performance'
        "

    # Three torchrun tests: `set -e` so any of them can fail the step.
    - name: MORI-IR
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE -e HOME=/tmp $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          timeout 60 torchrun --nproc_per_node=2 examples/shmem/ir/test_triton_shmem.py
          timeout 60 torchrun --nproc_per_node=8 examples/shmem/ir/test_triton_allreduce.py
          MORI_DISABLE_P2P=ON timeout 60 torchrun --nproc_per_node=8 examples/shmem/ir/test_triton_allreduce.py
        "

    # NOTE(review): benchmark sweep without `set -e`; only the last mpirun's
    # status gates the step. Left unchanged in case that is intentional.
    - name: MORI-CPP shmem_benchmark (P2P)
      run: |
        $CT exec $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw -s warp
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw -s warp
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency -s warp
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency -s warp
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw -s thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw -s thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency -s thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency -s thread
        "

    # Same sweep with MORI_DISABLE_P2P=ON. NOTE(review): also without `set -e`.
    - name: MORI-CPP shmem_benchmark (RDMA)
      run: |
        $CT exec $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw -s warp
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw -s warp
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency -s warp
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency -s warp
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw -s thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw -s thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency -s thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency -s thread
        "

    - name: MORI-CPP shmem (P2P)
      run: |
        $CT exec $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_get_thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_imm_thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_signal_thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/atomic_nonfetch_thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/atomic_fetch_thread
        "

    - name: MORI-CPP shmem (IBGDA)
      run: |
        $CT exec $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_get_thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_imm_thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_signal_thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/atomic_nonfetch_thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/atomic_fetch_thread
        "

    - name: MORI-CCL/shmem
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE
          timeout 600 pytest tests/python/shmem/test_api.py -v
        "

    # Two collective tests: `set -e` so an allgather failure is not masked by
    # a passing all2all run.
    - name: MORI-CCL collectives
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          MORI_ENABLE_SDMA=1 timeout 300 python -m tests.python.ccl.test_allgather --world-size 8 --elems 1024 --iterations 1 --warmup 0
          MORI_ENABLE_SDMA=1 timeout 300 python -m tests.python.ccl.test_all2all --world-size 8 --elems 1024 --iterations 1 --warmup 0
        "

    # Runs the async_ll bench once per token count, using a fresh master port
    # for each iteration. The escaped \$ keeps TOKENS/PORT for the inner shell.
    # NOTE(review): without `set -e`, only the last iteration's status gates
    # the step.
    - name: MORI-EP async kernel bench (intranode)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE
          PORT=12390
          for TOKENS in 64 128; do
            echo \"=== async_ll bench max-tokens=\$TOKENS port=\$PORT ===\"
            MORI_ENABLE_SDMA=1 GPU_PER_NODE=8 \
            timeout 120 torchrun \
              --nnodes=1 --node_rank=0 --nproc_per_node=1 \
              --master_addr=127.0.0.1 --master_port=\$PORT \
              examples/ops/dispatch_combine/test_dispatch_combine_internode.py \
              --kernel-type async_ll --num-qp 2 --cmd bench \
              --dtype bf16 --max-tokens \$TOKENS
            PORT=\$((PORT + 1))
          done
        "

    # Remove the container, then (if the CI image exists) use a throwaway
    # container to give workspace ownership back to the runner user.
    - name: Cleanup
      if: always()
      run: |
        $CT rm -f $CONTAINER || true
        if $CT image inspect $IMAGE &>/dev/null; then
          $CT run --rm -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE $IMAGE \
            chown -R $(id -u):$(id -g) $GITHUB_WORKSPACE 2>/dev/null || true
        fi
# Builds a dedicated JAX image, installs mori with the XLA FFI ops enabled,
# and runs the JAX dispatch/combine test on each platform runner.
jax-intranode-test:
  name: mori JAX intranode test (${{ matrix.platform }})
  needs: build
  runs-on: ${{ matrix.runner }}
  timeout-minutes: 90
  strategy:
    fail-fast: false
    matrix:
      include:
        - platform: MI355X_AINIC
          gpu_arch: gfx950
          rdma_devices: rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
          runner:
            - self-hosted
            - MI355X-AINIC-TW
        - platform: MI300X_BNXT
          gpu_arch: gfx942
          rdma_devices: bnxt_re0,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8,bnxt_re9
          runner:
            - self-hosted
            - MI300X-BNXT
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: true

    # The image is built for the matrix GPU target; podman additionally gets
    # --isolation=chroot.
    - name: Build JAX CI image
      run: |
        BUILD_ARGS="--network=host --build-arg GPU_TARGET=${{ matrix.gpu_arch }}"
        if [ "$CT" = "podman" ]; then
          BUILD_ARGS="$BUILD_ARGS --isolation=chroot"
        fi
        $CT build $BUILD_ARGS \
          -t rocm/mori:jax-ci \
          -f docker/Dockerfile.jax091-rocm711 .

    # The container name carries a _jax suffix to keep it distinct from the
    # container name used by the other jobs of this run.
    - name: Start container
      run: |
        $CT rm -f ${CONTAINER}_jax 2>/dev/null || true
        CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name ${CONTAINER}_jax \
          -e MORI_RDMA_DEVICES=${{ matrix.rdma_devices }} \
          -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
          -w $GITHUB_WORKSPACE \
          rocm/mori:jax-ci sleep infinity
        $CT exec ${CONTAINER}_jax \
          git config --global --add safe.directory $GITHUB_WORKSPACE

    - name: Install mori (XLA FFI)
      run: |
        $CT exec ${CONTAINER}_jax bash -c \
          "cd $GITHUB_WORKSPACE && BUILD_UMBP=OFF BUILD_XLA_FFI_OPS=ON pip install --break-system-packages ."

    # MORI_KERNEL_DIR is set to the first build/lib/gfx* directory; the
    # escaped \$( ... ) is evaluated by the shell inside the container.
    - name: MORI-EP JAX test
      run: |
        $CT exec ${CONTAINER}_jax bash -c "
          cd $GITHUB_WORKSPACE
          MORI_KERNEL_DIR=\$(ls -d $GITHUB_WORKSPACE/build/lib/gfx* | head -1) \
            timeout 300 pytest tests/python/ops/test_dispatch_combine_jax.py -s -v
        "

    # Best-effort teardown: remove the container, then restore workspace
    # ownership using the JAX image when it exists.
    - name: Cleanup
      if: always()
      run: |
        $CT rm -f ${CONTAINER}_jax || true
        if $CT image inspect rocm/mori:jax-ci &>/dev/null; then
          $CT run --rm -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE rocm/mori:jax-ci \
            chown -R $(id -u):$(id -g) $GITHUB_WORKSPACE 2>/dev/null || true
        fi
# Builds mori with UMBP and its tests enabled (editable install), then runs
# the UMBP Python packaging tests and the C++ unit tests through ctest.
umbp-unit-test:
  name: mori umbp unit test (${{ matrix.platform }})
  needs: build
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      include:
        - platform: MI355X_AINIC
          runner: [self-hosted, MI355X-AINIC-TW]
  timeout-minutes: 30
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: true

    - name: Build CI image
      run: $CT build --network=host --build-arg BASE_IMAGE=$BASE_IMAGE -t $IMAGE -f docker/Dockerfile.dev .

    - name: Start container
      run: |
        $CT rm -f $CONTAINER 2>/dev/null || true
        CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name $CONTAINER \
          -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
          -w $GITHUB_WORKSPACE \
          $IMAGE sleep infinity
        $CT exec $CONTAINER \
          git config --global --add safe.directory $GITHUB_WORKSPACE

    - name: Build mori with UMBP
      run: |
        $CT exec $CONTAINER bash -c \
          "cd $GITHUB_WORKSPACE && BUILD_UMBP=ON BUILD_TESTS=ON pip install -e ."

    - name: UMBP Python packaging tests
      run: |
        $CT exec $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE
          timeout 60 pytest tests/python/umbp/ -v
        "

    # ctest runs from the build directory and selects tests by name regex.
    - name: UMBP C++ unit tests (local + distributed)
      run: |
        $CT exec $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE/build
          timeout 300 ctest --output-on-failure \
            -R '^(umbp_|test_dummy_ssd_tier|test_prefix_aware_eviction|test_ssd_tier)'
        "

    # Ownership is restored from a throwaway container rather than via `exec`
    # in the job container: `exec` silently does nothing when that container
    # never started or has already exited, which would leave files created
    # inside the container in the workspace for the next checkout on this
    # runner. This matches the cleanup used by the other jobs in this workflow.
    - name: Cleanup
      if: always()
      run: |
        $CT rm -f $CONTAINER || true
        if $CT image inspect $IMAGE &>/dev/null; then
          $CT run --rm -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE $IMAGE \
            chown -R $(id -u):$(id -g) $GITHUB_WORKSPACE 2>/dev/null || true
        fi
# Two-node test suite. The runner host acts as node1 (rank 0); node2 (rank 1)
# is driven over ssh after the workspace has been rsync'ed to it.
#
# Every two-rank step launches the node2 command in the background, runs the
# node1 command in the foreground, and then waits on the recorded node2 PID.
# A bare `wait` always returns 0, which would hide a rank-1 failure; waiting
# on the specific PID propagates node2's exit status to the step.
internode-test:
  name: mori internode test (${{ matrix.platform }})
  needs: build
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      include:
        - platform: MI355X_AINIC
          runner: [self-hosted, MI355X-AINIC]
          node1_host: 10.2.80.22
          node2_host: 10.2.80.20
          node2_port: 22
          node1_ifname: enp193s0f1np1
          node2_ifname: enp193s0f1np1
          rdma_devices: rocep105s0,rocep121s0,rocep137s0,rocep153s0,rocep233s0,rocep249s0,rocep25s0,rocep9s0
          rdma_sl: 3
          rdma_tc: 104
          ct: podman
        - platform: MI300X_BNXT
          runner: [self-hosted, MI300X-BNXT]
          node1_host: 10.245.128.61
          node2_host: 10.245.128.59
          node2_port: 22
          node1_ifname: enp159s0np0
          node2_ifname: enp159s0np0
          rdma_devices: bnxt_re0,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8,bnxt_re9
          rdma_sl: 3
          rdma_tc: 104
          ct: docker
  env:
    # Overrides the workflow-level CT with the per-platform container runtime.
    CT: ${{ matrix.ct }}
    SSH_OPTS: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
    NODE1_HOST: ${{ matrix.node1_host }}
    NODE2_HOST: ${{ matrix.node2_host }}
    NODE2_PORT: ${{ matrix.node2_port }}
    NODE1_IFNAME: ${{ matrix.node1_ifname }}
    NODE2_IFNAME: ${{ matrix.node2_ifname }}
    MORI_RDMA_DEVICES: ${{ matrix.rdma_devices }}
    MORI_RDMA_SL: ${{ matrix.rdma_sl }}
    MORI_RDMA_TC: ${{ matrix.rdma_tc }}
    MORI_SOCKET_IFNAME: ${{ matrix.node1_ifname }}
  timeout-minutes: 45
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: true

    - name: Build CI image
      run: $CT build --network=host --build-arg BASE_IMAGE=$BASE_IMAGE -t $IMAGE -f docker/Dockerfile.dev .

    - name: Start container on node1
      run: |
        $CT rm -f $CONTAINER 2>/dev/null || true
        CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name $CONTAINER \
          -e MORI_RDMA_DEVICES=$MORI_RDMA_DEVICES \
          -e MORI_RDMA_SL=$MORI_RDMA_SL \
          -e MORI_RDMA_TC=$MORI_RDMA_TC \
          -e MORI_SOCKET_IFNAME=$MORI_SOCKET_IFNAME \
          -e GLOO_SOCKET_IFNAME=$NODE1_IFNAME \
          -e NCCL_SOCKET_IFNAME=$NODE1_IFNAME \
          -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
          -w $GITHUB_WORKSPACE \
          $IMAGE sleep infinity
        $CT exec $CONTAINER \
          git config --global --add safe.directory $GITHUB_WORKSPACE

    # Mirrors the workspace (without .git) to the same path on node2, then
    # builds the image and starts the container there. All variables in the
    # quoted ssh commands are expanded on node1 before being sent.
    - name: Start container on node2
      run: |
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "
          $CT rm -f $CONTAINER 2>/dev/null || true
          mkdir -p $GITHUB_WORKSPACE
        "
        rsync -az --exclude='.git' \
          -e "ssh $SSH_OPTS -p $NODE2_PORT" \
          $GITHUB_WORKSPACE/ \
          $(whoami)@$NODE2_HOST:$GITHUB_WORKSPACE/
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "
          cd $GITHUB_WORKSPACE &&
          $CT build --network=host --build-arg BASE_IMAGE=$BASE_IMAGE -t $IMAGE -f docker/Dockerfile.dev . &&
          CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name $CONTAINER \
            -e MORI_RDMA_DEVICES=$MORI_RDMA_DEVICES \
            -e MORI_RDMA_SL=$MORI_RDMA_SL \
            -e MORI_RDMA_TC=$MORI_RDMA_TC \
            -e MORI_SOCKET_IFNAME=$MORI_SOCKET_IFNAME \
            -e GLOO_SOCKET_IFNAME=$NODE2_IFNAME \
            -e NCCL_SOCKET_IFNAME=$NODE2_IFNAME \
            -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
            -w $GITHUB_WORKSPACE \
            $IMAGE sleep infinity &&
          $CT exec $CONTAINER \
            git config --global --add safe.directory $GITHUB_WORKSPACE
        "

    # numactl is installed only when it is not already on PATH.
    - name: Install mori (node1)
      run: |
        $CT exec $CONTAINER bash -c \
          "cd $GITHUB_WORKSPACE && BUILD_EXAMPLES=ON pip install . && pip install prettytable && (command -v numactl >/dev/null || (apt-get update && apt-get install -y numactl))"

    - name: Install mori (node2)
      run: |
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST \
          "$CT exec $CONTAINER bash -c 'cd $GITHUB_WORKSPACE && BUILD_EXAMPLES=ON pip install . && pip install prettytable && (command -v numactl >/dev/null || (apt-get update && apt-get install -y numactl))'"

    - name: MORI-IO internode write sweep
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_io_benchmark.sh"
        PORT=29120
        echo "=== MORI-IO internode benchmark: wide-write-sweep port=$PORT ==="
        NODE2_CMD=(
          $CT exec $CONTAINER bash $SCRIPT
          --rank 1 --master-addr $NODE1_HOST --master-port $PORT
          --ifname $NODE2_IFNAME
          -- --op-type write --transfer-batch-size 128 --all
          --sweep-start-size 1024 --sweep-max-size 16777216 --iters 4
          --enable-sess --enable-batch-transfer
          --num-qp-per-transfer 2
          --num-initiator-dev 8 --num-target-dev 8
        )
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
        NODE2_PID=$!
        $CT exec $CONTAINER bash $SCRIPT \
          --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
          --ifname $NODE1_IFNAME \
          -- --op-type write --transfer-batch-size 128 --all \
          --sweep-start-size 1024 --sweep-max-size 16777216 --iters 4 \
          --enable-sess --enable-batch-transfer \
          --num-qp-per-transfer 2 \
          --num-initiator-dev 8 --num-target-dev 8
        # Propagate node2's exit status (a bare `wait` would return 0).
        wait "$NODE2_PID"

    - name: MORI-IO internode read
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_io_benchmark.sh"
        PORT=29121
        echo "=== MORI-IO internode benchmark: batch-session-read port=$PORT ==="
        NODE2_CMD=(
          $CT exec $CONTAINER bash $SCRIPT
          --rank 1 --master-addr $NODE1_HOST --master-port $PORT
          --ifname $NODE2_IFNAME
          -- --op-type read --buffer-size 4096 --transfer-batch-size 128 --iters 8
          --enable-batch-transfer --enable-sess --poll_cq_mode event
          --num-qp-per-transfer 2
          --num-initiator-dev 8 --num-target-dev 8
        )
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
        NODE2_PID=$!
        $CT exec $CONTAINER bash $SCRIPT \
          --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
          --ifname $NODE1_IFNAME \
          -- --op-type read --buffer-size 4096 --transfer-batch-size 128 --iters 8 \
          --enable-batch-transfer --enable-sess --poll_cq_mode event \
          --num-qp-per-transfer 2 \
          --num-initiator-dev 8 --num-target-dev 8
        wait "$NODE2_PID"

    - name: MORI-IO internode CPU memory sweep (chunking + 4 NIC x 4 QP)
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_io_benchmark.sh"
        # CPU host memory, chunking on (default), striped across 4 NUMA-local NICs
        # with 16 QPs total (= 4 QP per NIC). Single posting thread (inline).
        PORT=29122
        for OP in write read; do
          echo "=== MORI-IO internode benchmark: cpu-mem $OP sweep (chunking, 4 NIC x 4 QP) port=$PORT ==="
          NODE2_CMD=(
            $CT exec -e MORI_IO_NUM_NICS_PER_TRANSFER=4 $CONTAINER bash $SCRIPT
            --rank 1 --master-addr $NODE1_HOST --master-port $PORT
            --ifname $NODE2_IFNAME --numa 0
            -- --mem-type cpu --op-type $OP --transfer-batch-size 64 --all
            --sweep-start-size 1024 --sweep-max-size 1048576 --iters 4
            --enable-sess --enable-batch-transfer
            --num-qp-per-transfer 16 --num-initiator-dev 1 --num-target-dev 1
          )
          ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
          NODE2_PID=$!
          $CT exec -e MORI_IO_NUM_NICS_PER_TRANSFER=4 $CONTAINER bash $SCRIPT \
            --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
            --ifname $NODE1_IFNAME --numa 0 \
            -- --mem-type cpu --op-type $OP --transfer-batch-size 64 --all \
            --sweep-start-size 1024 --sweep-max-size 1048576 --iters 4 \
            --enable-sess --enable-batch-transfer \
            --num-qp-per-transfer 16 --num-initiator-dev 1 --num-target-dev 1
          wait "$NODE2_PID"
          PORT=$((PORT + 1))
        done

    - name: MORI-IO internode multi-worker executor correctness (chunking off)
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_io_benchmark.sh"
        PORT=29124
        # Legacy multi-worker executor path: only active when chunking is off and
        # single NIC. Small write run to guard byte-for-byte correctness of that path.
        echo "=== MORI-IO internode benchmark: multi-worker executor (chunking off) port=$PORT ==="
        NODE2_CMD=(
          $CT exec $CONTAINER bash $SCRIPT
          --rank 1 --master-addr $NODE1_HOST --master-port $PORT
          --ifname $NODE2_IFNAME
          -- --op-type write --disable-chunking --transfer-batch-size 64 --all
          --sweep-start-size 1024 --sweep-max-size 1048576 --iters 4
          --enable-sess --enable-batch-transfer
          --num-qp-per-transfer 4 --num-worker-threads 2
          --num-initiator-dev 8 --num-target-dev 8
        )
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
        NODE2_PID=$!
        $CT exec $CONTAINER bash $SCRIPT \
          --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
          --ifname $NODE1_IFNAME \
          -- --op-type write --disable-chunking --transfer-batch-size 64 --all \
          --sweep-start-size 1024 --sweep-max-size 1048576 --iters 4 \
          --enable-sess --enable-batch-transfer \
          --num-qp-per-transfer 4 --num-worker-threads 2 \
          --num-initiator-dev 8 --num-target-dev 8
        wait "$NODE2_PID"

    - name: MORI-IO internode cross-rail write (rail affinity)
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_io_benchmark.sh"
        PORT=29125
        # Cross-rail validation: --target-dev-offset 5 makes GPU-0 pair with GPU-5
        # (different NUMA → different NIC preference). With MORI_IO_RAIL_AFFINITY=1
        # the receiver must follow the sender's railId, keeping traffic on-rail.
        # On a rail-isolated fabric this would fail without rail affinity.
        echo "=== MORI-IO internode: cross-rail GPU write (offset=5, rail affinity ON) ==="
        NODE2_CMD=(
          $CT exec -e MORI_IO_RAIL_AFFINITY=1 $CONTAINER bash $SCRIPT
          --rank 1 --master-addr $NODE1_HOST --master-port $PORT
          --ifname $NODE2_IFNAME
          -- --op-type write --buffer-size 4096 --transfer-batch-size 64
          --iters 4 --enable-sess --enable-batch-transfer
          --num-qp-per-transfer 2
          --num-initiator-dev 8 --num-target-dev 8
          --target-dev-offset 5
        )
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
        NODE2_PID=$!
        $CT exec -e MORI_IO_RAIL_AFFINITY=1 $CONTAINER bash $SCRIPT \
          --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
          --ifname $NODE1_IFNAME \
          -- --op-type write --buffer-size 4096 --transfer-batch-size 64 \
          --iters 4 --enable-sess --enable-batch-transfer \
          --num-qp-per-transfer 2 \
          --num-initiator-dev 8 --num-target-dev 8 \
          --target-dev-offset 5
        wait "$NODE2_PID"

    # Each kernel/token combination gets its own master port.
    - name: MORI-EP internode normal kernel bench
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_test.sh"
        PORT=29140
        for KERNEL in v1 v1_ll; do
          for TOKENS in 64 128 1024 2048 4096; do
            echo "=== bench kernel=$KERNEL max-tokens=$TOKENS port=$PORT ==="
            NODE2_CMD=(
              $CT exec -e MORI_INTERNODE_TIMEOUT=600 $CONTAINER bash $SCRIPT
              --rank 1 --master-addr $NODE1_HOST --master-port $PORT
              --ifname $NODE2_IFNAME
              --cmd bench --kernel-type $KERNEL --max-tokens $TOKENS
            )
            ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
            NODE2_PID=$!
            $CT exec -e MORI_INTERNODE_TIMEOUT=600 $CONTAINER bash $SCRIPT \
              --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
              --ifname $NODE1_IFNAME \
              --cmd bench --kernel-type $KERNEL --max-tokens $TOKENS
            wait "$NODE2_PID"
            sleep 1
            PORT=$((PORT + 1))
          done
        done

    - name: MORI-EP internode normal kernel stress
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_test.sh"
        PORT=29160
        for KERNEL in v1 v1_ll; do
          for TOKENS in 64 128 1024 2048 4096; do
            echo "=== stress kernel=$KERNEL max-tokens=$TOKENS port=$PORT ==="
            NODE2_CMD=(
              $CT exec -e MORI_INTERNODE_TIMEOUT=600 $CONTAINER bash $SCRIPT
              --rank 1 --master-addr $NODE1_HOST --master-port $PORT
              --ifname $NODE2_IFNAME
              --cmd stress --kernel-type $KERNEL --max-tokens $TOKENS
            )
            ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
            NODE2_PID=$!
            $CT exec -e MORI_INTERNODE_TIMEOUT=600 $CONTAINER bash $SCRIPT \
              --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
              --ifname $NODE1_IFNAME \
              --cmd stress --kernel-type $KERNEL --max-tokens $TOKENS
            wait "$NODE2_PID"
            sleep 1
            PORT=$((PORT + 1))
          done
        done

    - name: MORI-EP internode async kernel test
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_test.sh"
        PORT=29180
        for TOKENS in 64 128; do
          echo "=== async_ll test max-tokens=$TOKENS port=$PORT ==="
          NODE2_CMD=(
            $CT exec
            -e GPU_PER_NODE=8 -e MORI_ENABLE_SDMA=1 -e MORI_INTERNODE_TIMEOUT=600
            $CONTAINER bash $SCRIPT
            --rank 1 --master-addr $NODE1_HOST --master-port $PORT
            --ifname $NODE2_IFNAME
            --cmd test --kernel-type async_ll
            --quant-type none --dtype bf16 --max-tokens $TOKENS
          )
          ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
          NODE2_PID=$!
          $CT exec \
            -e GPU_PER_NODE=8 \
            -e MORI_ENABLE_SDMA=1 \
            -e MORI_INTERNODE_TIMEOUT=600 \
            $CONTAINER bash $SCRIPT \
            --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
            --ifname $NODE1_IFNAME \
            --cmd test --kernel-type async_ll \
            --quant-type none --dtype bf16 --max-tokens $TOKENS
          wait "$NODE2_PID"
          sleep 1
          PORT=$((PORT + 1))
        done

    - name: Cleanup node1
      if: always()
      run: |
        $CT rm -f $CONTAINER || true
        if $CT image inspect $IMAGE &>/dev/null; then
          $CT run --rm -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE $IMAGE \
            chown -R $(id -u):$(id -g) $GITHUB_WORKSPACE 2>/dev/null || true
        fi

    # Uses the shared SSH_OPTS like every other ssh call in this job.
    - name: Cleanup node2
      if: always()
      run: |
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST \
          "$CT rm -f $CONTAINER || true"
# Optional smoke test, run only for manual dispatches that tick
# run_umbp_single_node_hicache. Checks out SGLang next to MoRI and runs the
# single-node hicache script with dummy weights.
umbp-single-node-hicache:
  name: mori umbp single-node hicache smoke test
  needs: build
  # The `inputs` context keeps boolean inputs as real booleans, so the flag is
  # tested directly. Comparing it with the string 'true' coerces both sides to
  # numbers (true -> 1, 'true' -> NaN) and is therefore never true, which
  # would make this job impossible to trigger.
  if: github.event_name == 'workflow_dispatch' && inputs.run_umbp_single_node_hicache
  runs-on: [self-hosted, MI355X-AINIC-TW]
  timeout-minutes: 180
  steps:
    - name: Checkout MoRI
      uses: actions/checkout@v4
      with:
        submodules: true

    # SGLang is checked out into ./sglang inside the MoRI workspace.
    - name: Checkout SGLang
      uses: actions/checkout@v4
      with:
        repository: ${{ inputs.sglang_repo }}
        ref: ${{ inputs.sglang_ref }}
        path: sglang
        fetch-depth: 1

    - name: Ensure hicache script is executable
      run: chmod +x src/umbp/scripts/run_umbp_single_node_hicache.sh

    # The script is configured entirely through the environment below.
    - name: Run hicache smoke test
      env:
        SGLANG_REPO: ${{ github.workspace }}/sglang
        MORI_REPO: ${{ github.workspace }}
        RESULTS_DIR: ${{ github.workspace }}/umbp_single_node_results
        USE_DUMMY_WEIGHTS: "true"
        RUN_GSM8K: "false"
        MODEL_PATH: ${{ github.workspace }}/umbp_dummy_model
        ENABLE_DP: ${{ inputs.enable_dp }}
        START_UMBP_MASTER: "false"
      run: |
        set -euo pipefail
        mkdir -p "$RESULTS_DIR"
        ./src/umbp/scripts/run_umbp_single_node_hicache.sh

    # Results are uploaded even when the smoke test fails; a missing results
    # directory only produces a warning.
    - name: Upload hicache artifacts
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: umbp-single-node-hicache-${{ github.run_id }}
        path: |
          ${{ github.workspace }}/umbp_single_node_results
        if-no-files-found: warn

    # Best-effort removal of the umbp-single-node container.
    # NOTE(review): presumably this container is created by the hicache
    # script - confirm the name matches.
    - name: Cleanup hicache container
      if: always()
      run: docker rm -f umbp-single-node >/dev/null 2>&1 || true