Feature: add umbp benchmark / distributed mode flush API / hot-path o… #1281
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# MoRI continuous-integration workflow.
name: MoRI CI test

# Triggers: pushes and pull requests targeting main, plus manual dispatch.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:
    # Manual-run knobs; they are read by the umbp-single-node-hicache job.
    inputs:
      run_umbp_single_node_hicache:
        description: Run UMBP single-node hicache smoke test
        type: boolean
        default: false
      sglang_repo:
        description: owner/repo to checkout for SGLang
        type: string
        # NOTE(review): confirm this owner/repo is the intended SGLang upstream.
        default: 'sgl-team/sglang'
      sglang_ref:
        description: branch, tag, or SHA for SGLang checkout
        type: string
        default: 'main'
      enable_dp:
        description: Enable DP/EP modes when launching the hicache test
        type: boolean
        default: false

# One active run per pull request (or per ref when there is no PR number);
# a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

# Defaults shared by all jobs. internode-test overrides CT per matrix entry.
env:
  IMAGE: rocm/mori:ci
  BASE_IMAGE: rocm/pytorch:rocm7.2.4_ubuntu24.04_py3.12_pytorch_release_2.8.0
  CONTAINER: mori_ci_${{ github.run_id }}
  CT: docker

jobs:
# Builds the CI image and checks that mori installs (plain and with examples)
# on each runner platform. The other jobs in this workflow list it in `needs`.
build:
  name: mori build (${{ matrix.platform }})
  runs-on: ${{ matrix.runner }}
  timeout-minutes: 15
  strategy:
    fail-fast: false
    matrix:
      include:
        - platform: MI355X_AINIC
          runner:
            - self-hosted
            - MI355X-AINIC-TW
        - platform: MI300X_BNXT
          runner:
            - self-hosted
            - MI300X-BNXT
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: true

    - name: Build CI image
      run: |
        $CT build --network=host --build-arg BASE_IMAGE=$BASE_IMAGE -t $IMAGE -f docker/Dockerfile.dev .

    # Remove any leftover container with the same name, then start a
    # long-lived one with the workspace mounted at the same path.
    - name: Start container
      run: |
        $CT rm -f $CONTAINER 2>/dev/null || true
        CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name $CONTAINER \
          -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
          -w $GITHUB_WORKSPACE \
          $IMAGE sleep infinity
        $CT exec $CONTAINER \
          git config --global --add safe.directory $GITHUB_WORKSPACE

    - name: Build and install mori
      run: |
        $CT exec $CONTAINER pip install .

    - name: Build and install mori (with examples)
      run: |
        $CT exec $CONTAINER bash -c "BUILD_EXAMPLES=ON pip install ."

    - name: Verify installation
      run: |
        $CT exec $CONTAINER python -c "import mori; print('mori ' + mori.__version__ + ' OK')"

    # Always runs: drop the container and hand workspace ownership back to
    # the runner user. Both commands are best-effort.
    - name: Cleanup
      if: always()
      run: |
        $CT rm -f $CONTAINER || true
        $CT run --rm -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE $BASE_IMAGE \
          chown -R $(id -u):$(id -g) $GITHUB_WORKSPACE 2>/dev/null || true
# Single-node test suite: EP dispatch/combine, IO, IR, C++ shmem examples and
# benchmarks, and CCL collectives, run inside one long-lived CI container.
#
# Every multi-command `bash -c "..."` script starts with `set -e`. Without it
# the inner shell reports only the status of its last command, so an earlier
# failing test or benchmark would be silently ignored.
intranode-test:
  name: mori intranode test (${{ matrix.platform }})
  needs: build
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      include:
        - platform: MI355X_AINIC
          runner: [self-hosted, MI355X-AINIC-TW]
          rdma_devices: rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
          rdma_sl: 3
          rdma_tc: 104
        - platform: MI300X_BNXT
          runner: [self-hosted, MI300X-BNXT]
          rdma_devices: bnxt_re0,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8,bnxt_re9
          rdma_sl: 3
          rdma_tc: 104
          socket_ifname: enp159s0np0
  env:
    MORI_RDMA_DEVICES: ${{ matrix.rdma_devices }}
    MORI_RDMA_SL: ${{ matrix.rdma_sl }}
    MORI_RDMA_TC: ${{ matrix.rdma_tc }}
    # Only the MI300X_BNXT entry sets socket_ifname; empty otherwise.
    MORI_SOCKET_IFNAME: ${{ matrix.socket_ifname }}
  timeout-minutes: 45
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: true

    - name: Build CI image
      run: $CT build --network=host --build-arg BASE_IMAGE=$BASE_IMAGE -t $IMAGE -f docker/Dockerfile.dev .

    # MORI_SOCKET_IFNAME is forwarded only when it is non-empty.
    - name: Start container
      run: |
        $CT rm -f $CONTAINER 2>/dev/null || true
        CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name $CONTAINER \
          -e MORI_RDMA_DEVICES=$MORI_RDMA_DEVICES \
          -e MORI_RDMA_SL=$MORI_RDMA_SL \
          -e MORI_RDMA_TC=$MORI_RDMA_TC \
          ${MORI_SOCKET_IFNAME:+-e MORI_SOCKET_IFNAME=$MORI_SOCKET_IFNAME} \
          -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
          -w $GITHUB_WORKSPACE \
          $IMAGE sleep infinity
        $CT exec $CONTAINER \
          git config --global --add safe.directory $GITHUB_WORKSPACE

    - name: Install mori
      run: |
        $CT exec $CONTAINER bash -c \
          "cd $GITHUB_WORKSPACE && BUILD_BENCHMARK=ON BUILD_EXAMPLES=ON pip install . && pip install prettytable"

    - name: MORI-EP (intranode)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE && timeout 360 pytest tests/python/ops/test_dispatch_combine_intranode.py -v
        "

    - name: MORI-EP (internode_v1)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE && timeout 300 pytest tests/python/ops/test_dispatch_combine_internode_v1.py -v
        "

    - name: MORI-EP (routing handle)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE && timeout 300 pytest tests/python/ops/test_dispatch_combine_routing_handle.py -v
        "

    - name: MORI-EP (async_ll SDMA)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE && MORI_ENABLE_SDMA=1 timeout 300 pytest tests/python/ops/test_dispatch_combine_async_ll.py -v
        "

    - name: MORI-EP (async_ll IBGDA)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE && MORI_DISABLE_P2P=1 MORI_ENABLE_SDMA=0 timeout 300 pytest tests/python/ops/test_dispatch_combine_async_ll.py -v
        "

    # set -e: a failure of the first bench run must fail the step.
    - name: MORI-EP bench
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          timeout 120 python3 tests/python/ops/bench_dispatch_combine.py
          timeout 120 python3 tests/python/ops/bench_dispatch_combine.py \
            --cmd bench --dtype bf16 --quant-type fp8_blockwise \
            --zero-copy 0 --max-tokens 128 \
            --force-scale-active 1 --report-scale-stats 1
        "

    # set -e: a failure of the first pytest run must fail the step.
    - name: MORI-IO
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          timeout 60 pytest tests/python/io/ -v
          MORI_IO_XGMI_SCATTER_GATHER_THRESHOLD=4 \
          timeout 120 pytest tests/python/io/test_discrete_buffer.py -v -k 'not performance'
        "

    - name: MORI-IR
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE -e HOME=/tmp $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          timeout 60 torchrun --nproc_per_node=2 examples/shmem/ir/test_triton_shmem.py
          timeout 60 torchrun --nproc_per_node=8 examples/shmem/ir/test_triton_allreduce.py
          MORI_DISABLE_P2P=ON timeout 60 torchrun --nproc_per_node=8 examples/shmem/ir/test_triton_allreduce.py
        "

    - name: MORI-CPP shmem_benchmark (P2P)
      run: |
        $CT exec $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw -s warp
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw -s warp
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency -s warp
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency -s warp
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw -s thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw -s thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency -s thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency -s thread
        "

    - name: MORI-CPP shmem_benchmark (RDMA)
      run: |
        $CT exec $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw -s warp
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw -s warp
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency -s warp
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency -s warp
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_bw -s thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_bw -s thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_put_latency -s thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/benchmark/p2p_get_latency -s thread
        "

    - name: MORI-CPP shmem (P2P)
      run: |
        $CT exec $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_get_thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_imm_thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_signal_thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/atomic_nonfetch_thread
          timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/atomic_fetch_thread
        "

    - name: MORI-CPP shmem (IBGDA)
      run: |
        $CT exec $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_get_thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_imm_thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/concurrent_put_signal_thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/atomic_nonfetch_thread
          MORI_DISABLE_P2P=ON timeout 60 mpirun --allow-run-as-root -np 2 ./build/examples/atomic_fetch_thread
        "

    - name: MORI-CCL/shmem
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE
          timeout 600 pytest tests/python/shmem/test_api.py -v
        "

    # set -e: an allgather failure must not be hidden by a passing all2all.
    - name: MORI-CCL collectives
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          MORI_ENABLE_SDMA=1 timeout 300 python -m tests.python.ccl.test_allgather --world-size 8 --elems 1024 --iterations 1 --warmup 0
          MORI_ENABLE_SDMA=1 timeout 300 python -m tests.python.ccl.test_all2all --world-size 8 --elems 1024 --iterations 1 --warmup 0
        "

    # set -e is essential here: the loop body ends with a PORT assignment,
    # so without it the script exits 0 even when torchrun fails.
    # \$ defers expansion to the shell inside the container.
    - name: MORI-EP async kernel bench (intranode)
      run: |
        $CT exec -e PYTHONPATH=$GITHUB_WORKSPACE $CONTAINER bash -c "
          set -e
          cd $GITHUB_WORKSPACE
          PORT=12390
          for TOKENS in 64 128; do
            echo \"=== async_ll bench max-tokens=\$TOKENS port=\$PORT ===\"
            MORI_ENABLE_SDMA=1 GPU_PER_NODE=8 \
            timeout 120 torchrun \
              --nnodes=1 --node_rank=0 --nproc_per_node=1 \
              --master_addr=127.0.0.1 --master_port=\$PORT \
              examples/ops/dispatch_combine/test_dispatch_combine_internode.py \
              --kernel-type async_ll --num-qp 2 --cmd bench \
              --dtype bf16 --max-tokens \$TOKENS
            PORT=\$((PORT + 1))
          done
        "

    # Always runs: remove the container, then (if the image exists) restore
    # workspace ownership to the runner user. Best-effort.
    - name: Cleanup
      if: always()
      run: |
        $CT rm -f $CONTAINER || true
        if $CT image inspect $IMAGE &>/dev/null; then
          $CT run --rm -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE $IMAGE \
            chown -R $(id -u):$(id -g) $GITHUB_WORKSPACE 2>/dev/null || true
        fi
# Builds a JAX-specific image, installs mori with the XLA FFI ops enabled and
# runs the JAX dispatch/combine test. Uses its own container name
# (${CONTAINER}_jax) and image tag (rocm/mori:jax-ci).
jax-intranode-test:
  name: mori JAX intranode test (${{ matrix.platform }})
  needs: build
  runs-on: ${{ matrix.runner }}
  timeout-minutes: 90
  strategy:
    fail-fast: false
    matrix:
      include:
        - platform: MI355X_AINIC
          runner:
            - self-hosted
            - MI355X-AINIC-TW
          rdma_devices: rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
          gpu_arch: gfx950
        - platform: MI300X_BNXT
          runner:
            - self-hosted
            - MI300X-BNXT
          rdma_devices: bnxt_re0,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8,bnxt_re9
          gpu_arch: gfx942
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: true

    # The GPU target comes from the matrix; podman builds additionally get
    # --isolation=chroot.
    - name: Build JAX CI image
      run: |
        BUILD_ARGS="--network=host --build-arg GPU_TARGET=${{ matrix.gpu_arch }}"
        if [ "$CT" = "podman" ]; then
          BUILD_ARGS="$BUILD_ARGS --isolation=chroot"
        fi
        $CT build $BUILD_ARGS \
          -t rocm/mori:jax-ci \
          -f docker/Dockerfile.jax091-rocm711 .

    - name: Start container
      run: |
        $CT rm -f ${CONTAINER}_jax 2>/dev/null || true
        CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name ${CONTAINER}_jax \
          -e MORI_RDMA_DEVICES=${{ matrix.rdma_devices }} \
          -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
          -w $GITHUB_WORKSPACE \
          rocm/mori:jax-ci sleep infinity
        $CT exec ${CONTAINER}_jax \
          git config --global --add safe.directory $GITHUB_WORKSPACE

    - name: Install mori (XLA FFI)
      run: |
        $CT exec ${CONTAINER}_jax bash -c \
          "cd $GITHUB_WORKSPACE && BUILD_UMBP=OFF BUILD_XLA_FFI_OPS=ON pip install --break-system-packages ."

    # MORI_KERNEL_DIR is set to the first build/lib/gfx* directory found;
    # \$( ... ) is evaluated by the shell inside the container.
    - name: MORI-EP JAX test
      run: |
        $CT exec ${CONTAINER}_jax bash -c "
          cd $GITHUB_WORKSPACE
          MORI_KERNEL_DIR=\$(ls -d $GITHUB_WORKSPACE/build/lib/gfx* | head -1) \
          timeout 300 pytest tests/python/ops/test_dispatch_combine_jax.py -s -v
        "

    # Always runs; best-effort container removal and workspace chown.
    - name: Cleanup
      if: always()
      run: |
        $CT rm -f ${CONTAINER}_jax || true
        if $CT image inspect rocm/mori:jax-ci &>/dev/null; then
          $CT run --rm -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE rocm/mori:jax-ci \
            chown -R $(id -u):$(id -g) $GITHUB_WORKSPACE 2>/dev/null || true
        fi
# Builds mori with UMBP and tests enabled (editable install), then runs the
# UMBP Python tests and the matching C++ ctest targets. MI355X_AINIC only.
umbp-unit-test:
  name: mori umbp unit test (${{ matrix.platform }})
  needs: build
  runs-on: ${{ matrix.runner }}
  timeout-minutes: 30
  strategy:
    fail-fast: false
    matrix:
      include:
        - platform: MI355X_AINIC
          runner:
            - self-hosted
            - MI355X-AINIC-TW
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: true

    - name: Build CI image
      run: |
        $CT build --network=host --build-arg BASE_IMAGE=$BASE_IMAGE -t $IMAGE -f docker/Dockerfile.dev .

    - name: Start container
      run: |
        $CT rm -f $CONTAINER 2>/dev/null || true
        CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name $CONTAINER \
          -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
          -w $GITHUB_WORKSPACE \
          $IMAGE sleep infinity
        $CT exec $CONTAINER \
          git config --global --add safe.directory $GITHUB_WORKSPACE

    - name: Build mori with UMBP
      run: |
        $CT exec $CONTAINER bash -c \
          "cd $GITHUB_WORKSPACE && BUILD_UMBP=ON BUILD_TESTS=ON pip install -e ."

    - name: UMBP Python packaging tests
      run: |
        $CT exec $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE
          timeout 60 pytest tests/python/umbp/ -v
        "

    # ctest is limited to test names matching the -R regular expression.
    - name: UMBP C++ unit tests (local + distributed)
      run: |
        $CT exec $CONTAINER bash -c "
          cd $GITHUB_WORKSPACE/build
          timeout 300 ctest --output-on-failure \
            -R '^(umbp_|test_dummy_ssd_tier|test_prefix_aware_eviction|test_ssd_tier)'
        "

    # Always runs: chown from inside the still-running container first,
    # then remove it. Both commands are best-effort.
    - name: Cleanup
      if: always()
      run: |
        $CT exec $CONTAINER chown -R $(id -u):$(id -g) $GITHUB_WORKSPACE 2>/dev/null || true
        $CT rm -f $CONTAINER || true
# Two-node test suite. Node1 is the runner host; node2 is reached over ssh.
# Each test step starts rank 1 on node2 in the background, runs rank 0 on
# node1 in the foreground, then waits for node2.
#
# The node2 PID is recorded and waited on explicitly (`wait $NODE2_PID`).
# A bare `wait` always returns 0, which would discard a rank-1 failure.
internode-test:
  name: mori internode test (${{ matrix.platform }})
  needs: build
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      include:
        - platform: MI355X_AINIC
          runner: [self-hosted, MI355X-AINIC]
          node1_host: 10.2.80.22
          node2_host: 10.2.80.20
          node2_port: 22
          node1_ifname: enp193s0f1np1
          node2_ifname: enp193s0f1np1
          rdma_devices: rocep105s0,rocep121s0,rocep137s0,rocep153s0,rocep233s0,rocep249s0,rocep25s0,rocep9s0
          rdma_sl: 3
          rdma_tc: 104
          ct: podman
        - platform: MI300X_BNXT
          runner: [self-hosted, MI300X-BNXT]
          node1_host: 10.245.128.61
          node2_host: 10.245.128.59
          node2_port: 22
          node1_ifname: enp159s0np0
          node2_ifname: enp159s0np0
          rdma_devices: bnxt_re0,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re7,bnxt_re8,bnxt_re9
          rdma_sl: 3
          rdma_tc: 104
          ct: docker
  env:
    # Overrides the workflow-level CT with the per-platform container runtime.
    CT: ${{ matrix.ct }}
    SSH_OPTS: -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null
    NODE1_HOST: ${{ matrix.node1_host }}
    NODE2_HOST: ${{ matrix.node2_host }}
    NODE2_PORT: ${{ matrix.node2_port }}
    NODE1_IFNAME: ${{ matrix.node1_ifname }}
    NODE2_IFNAME: ${{ matrix.node2_ifname }}
    MORI_RDMA_DEVICES: ${{ matrix.rdma_devices }}
    MORI_RDMA_SL: ${{ matrix.rdma_sl }}
    MORI_RDMA_TC: ${{ matrix.rdma_tc }}
    MORI_SOCKET_IFNAME: ${{ matrix.node1_ifname }}
  timeout-minutes: 45
  steps:
    - name: Checkout
      uses: actions/checkout@v4
      with:
        submodules: true

    - name: Build CI image
      run: $CT build --network=host --build-arg BASE_IMAGE=$BASE_IMAGE -t $IMAGE -f docker/Dockerfile.dev .

    - name: Start container on node1
      run: |
        $CT rm -f $CONTAINER 2>/dev/null || true
        CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name $CONTAINER \
          -e MORI_RDMA_DEVICES=$MORI_RDMA_DEVICES \
          -e MORI_RDMA_SL=$MORI_RDMA_SL \
          -e MORI_RDMA_TC=$MORI_RDMA_TC \
          -e MORI_SOCKET_IFNAME=$MORI_SOCKET_IFNAME \
          -e GLOO_SOCKET_IFNAME=$NODE1_IFNAME \
          -e NCCL_SOCKET_IFNAME=$NODE1_IFNAME \
          -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
          -w $GITHUB_WORKSPACE \
          $IMAGE sleep infinity
        $CT exec $CONTAINER \
          git config --global --add safe.directory $GITHUB_WORKSPACE

    # Removes any stale container on node2, rsyncs the workspace there
    # (without .git), then builds the image and starts the container remotely.
    # All variables in the quoted ssh scripts are expanded on node1.
    - name: Start container on node2
      run: |
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "
          $CT rm -f $CONTAINER 2>/dev/null || true
          mkdir -p $GITHUB_WORKSPACE
        "
        rsync -az --exclude='.git' \
          -e "ssh $SSH_OPTS -p $NODE2_PORT" \
          $GITHUB_WORKSPACE/ \
          $(whoami)@$NODE2_HOST:$GITHUB_WORKSPACE/
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "
          cd $GITHUB_WORKSPACE &&
          $CT build --network=host --build-arg BASE_IMAGE=$BASE_IMAGE -t $IMAGE -f docker/Dockerfile.dev . &&
          CONTAINER_RUNTIME=$CT ./docker/ci_run.sh --name $CONTAINER \
            -e MORI_RDMA_DEVICES=$MORI_RDMA_DEVICES \
            -e MORI_RDMA_SL=$MORI_RDMA_SL \
            -e MORI_RDMA_TC=$MORI_RDMA_TC \
            -e MORI_SOCKET_IFNAME=$MORI_SOCKET_IFNAME \
            -e GLOO_SOCKET_IFNAME=$NODE2_IFNAME \
            -e NCCL_SOCKET_IFNAME=$NODE2_IFNAME \
            -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE \
            -w $GITHUB_WORKSPACE \
            $IMAGE sleep infinity &&
          $CT exec $CONTAINER \
            git config --global --add safe.directory $GITHUB_WORKSPACE
        "

    # numactl is installed via apt only when it is not already present.
    - name: Install mori (node1)
      run: |
        $CT exec $CONTAINER bash -c \
          "cd $GITHUB_WORKSPACE && BUILD_EXAMPLES=ON pip install . && pip install prettytable && (command -v numactl >/dev/null || (apt-get update && apt-get install -y numactl))"

    - name: Install mori (node2)
      run: |
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST \
          "$CT exec $CONTAINER bash -c 'cd $GITHUB_WORKSPACE && BUILD_EXAMPLES=ON pip install . && pip install prettytable && (command -v numactl >/dev/null || (apt-get update && apt-get install -y numactl))'"

    - name: MORI-IO internode write sweep
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_io_benchmark.sh"
        PORT=29120
        echo "=== MORI-IO internode benchmark: wide-write-sweep port=$PORT ==="
        NODE2_CMD=(
          $CT exec $CONTAINER bash $SCRIPT
          --rank 1 --master-addr $NODE1_HOST --master-port $PORT
          --ifname $NODE2_IFNAME
          -- --op-type write --transfer-batch-size 128 --all
          --sweep-start-size 1024 --sweep-max-size 16777216 --iters 4
          --enable-sess --enable-batch-transfer
          --num-qp-per-transfer 2
          --num-initiator-dev 8 --num-target-dev 8
        )
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
        NODE2_PID=$!
        $CT exec $CONTAINER bash $SCRIPT \
          --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
          --ifname $NODE1_IFNAME \
          -- --op-type write --transfer-batch-size 128 --all \
          --sweep-start-size 1024 --sweep-max-size 16777216 --iters 4 \
          --enable-sess --enable-batch-transfer \
          --num-qp-per-transfer 2 \
          --num-initiator-dev 8 --num-target-dev 8
        # Propagate node2's exit status; a bare `wait` would return 0.
        wait $NODE2_PID

    - name: MORI-IO internode read
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_io_benchmark.sh"
        PORT=29121
        echo "=== MORI-IO internode benchmark: batch-session-read port=$PORT ==="
        NODE2_CMD=(
          $CT exec $CONTAINER bash $SCRIPT
          --rank 1 --master-addr $NODE1_HOST --master-port $PORT
          --ifname $NODE2_IFNAME
          -- --op-type read --buffer-size 4096 --transfer-batch-size 128 --iters 8
          --enable-batch-transfer --enable-sess --poll_cq_mode event
          --num-qp-per-transfer 2
          --num-initiator-dev 8 --num-target-dev 8
        )
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
        NODE2_PID=$!
        $CT exec $CONTAINER bash $SCRIPT \
          --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
          --ifname $NODE1_IFNAME \
          -- --op-type read --buffer-size 4096 --transfer-batch-size 128 --iters 8 \
          --enable-batch-transfer --enable-sess --poll_cq_mode event \
          --num-qp-per-transfer 2 \
          --num-initiator-dev 8 --num-target-dev 8
        wait $NODE2_PID

    - name: MORI-IO internode CPU memory sweep (chunking + 4 NIC x 4 QP)
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_io_benchmark.sh"
        # CPU host memory, chunking on (default), striped across 4 NUMA-local NICs
        # with 16 QPs total (= 4 QP per NIC). Single posting thread (inline).
        PORT=29122
        for OP in write read; do
          echo "=== MORI-IO internode benchmark: cpu-mem $OP sweep (chunking, 4 NIC x 4 QP) port=$PORT ==="
          NODE2_CMD=(
            $CT exec -e MORI_IO_NUM_NICS_PER_TRANSFER=4 $CONTAINER bash $SCRIPT
            --rank 1 --master-addr $NODE1_HOST --master-port $PORT
            --ifname $NODE2_IFNAME --numa 0
            -- --mem-type cpu --op-type $OP --transfer-batch-size 64 --all
            --sweep-start-size 1024 --sweep-max-size 1048576 --iters 4
            --enable-sess --enable-batch-transfer
            --num-qp-per-transfer 16 --num-initiator-dev 1 --num-target-dev 1
          )
          ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
          NODE2_PID=$!
          $CT exec -e MORI_IO_NUM_NICS_PER_TRANSFER=4 $CONTAINER bash $SCRIPT \
            --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
            --ifname $NODE1_IFNAME --numa 0 \
            -- --mem-type cpu --op-type $OP --transfer-batch-size 64 --all \
            --sweep-start-size 1024 --sweep-max-size 1048576 --iters 4 \
            --enable-sess --enable-batch-transfer \
            --num-qp-per-transfer 16 --num-initiator-dev 1 --num-target-dev 1
          wait $NODE2_PID
          PORT=$((PORT + 1))
        done

    - name: MORI-IO internode multi-worker executor correctness (chunking off)
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_io_benchmark.sh"
        PORT=29124
        # Legacy multi-worker executor path: only active when chunking is off and
        # single NIC. Small write run to guard byte-for-byte correctness of that path.
        echo "=== MORI-IO internode benchmark: multi-worker executor (chunking off) port=$PORT ==="
        NODE2_CMD=(
          $CT exec $CONTAINER bash $SCRIPT
          --rank 1 --master-addr $NODE1_HOST --master-port $PORT
          --ifname $NODE2_IFNAME
          -- --op-type write --disable-chunking --transfer-batch-size 64 --all
          --sweep-start-size 1024 --sweep-max-size 1048576 --iters 4
          --enable-sess --enable-batch-transfer
          --num-qp-per-transfer 4 --num-worker-threads 2
          --num-initiator-dev 8 --num-target-dev 8
        )
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
        NODE2_PID=$!
        $CT exec $CONTAINER bash $SCRIPT \
          --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
          --ifname $NODE1_IFNAME \
          -- --op-type write --disable-chunking --transfer-batch-size 64 --all \
          --sweep-start-size 1024 --sweep-max-size 1048576 --iters 4 \
          --enable-sess --enable-batch-transfer \
          --num-qp-per-transfer 4 --num-worker-threads 2 \
          --num-initiator-dev 8 --num-target-dev 8
        wait $NODE2_PID

    - name: MORI-IO internode cross-rail write (rail affinity)
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_io_benchmark.sh"
        PORT=29125
        # Cross-rail validation: --target-dev-offset 5 makes GPU-0 pair with GPU-5
        # (different NUMA → different NIC preference). With MORI_IO_RAIL_AFFINITY=1
        # the receiver must follow the sender's railId, keeping traffic on-rail.
        # On a rail-isolated fabric this would fail without rail affinity.
        echo "=== MORI-IO internode: cross-rail GPU write (offset=5, rail affinity ON) ==="
        NODE2_CMD=(
          $CT exec -e MORI_IO_RAIL_AFFINITY=1 $CONTAINER bash $SCRIPT
          --rank 1 --master-addr $NODE1_HOST --master-port $PORT
          --ifname $NODE2_IFNAME
          -- --op-type write --buffer-size 4096 --transfer-batch-size 64
          --iters 4 --enable-sess --enable-batch-transfer
          --num-qp-per-transfer 2
          --num-initiator-dev 8 --num-target-dev 8
          --target-dev-offset 5
        )
        ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
        NODE2_PID=$!
        $CT exec -e MORI_IO_RAIL_AFFINITY=1 $CONTAINER bash $SCRIPT \
          --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
          --ifname $NODE1_IFNAME \
          -- --op-type write --buffer-size 4096 --transfer-batch-size 64 \
          --iters 4 --enable-sess --enable-batch-transfer \
          --num-qp-per-transfer 2 \
          --num-initiator-dev 8 --num-target-dev 8 \
          --target-dev-offset 5
        wait $NODE2_PID

    # Each kernel/token combination uses a fresh master port.
    - name: MORI-EP internode normal kernel bench
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_test.sh"
        PORT=29140
        for KERNEL in v1 v1_ll; do
          for TOKENS in 64 128 1024 2048 4096; do
            echo "=== bench kernel=$KERNEL max-tokens=$TOKENS port=$PORT ==="
            NODE2_CMD=(
              $CT exec -e MORI_INTERNODE_TIMEOUT=600 $CONTAINER bash $SCRIPT
              --rank 1 --master-addr $NODE1_HOST --master-port $PORT
              --ifname $NODE2_IFNAME
              --cmd bench --kernel-type $KERNEL --max-tokens $TOKENS
            )
            ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
            NODE2_PID=$!
            $CT exec -e MORI_INTERNODE_TIMEOUT=600 $CONTAINER bash $SCRIPT \
              --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
              --ifname $NODE1_IFNAME \
              --cmd bench --kernel-type $KERNEL --max-tokens $TOKENS
            wait $NODE2_PID
            sleep 1
            PORT=$((PORT + 1))
          done
        done

    - name: MORI-EP internode normal kernel stress
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_test.sh"
        PORT=29160
        for KERNEL in v1 v1_ll; do
          for TOKENS in 64 128 1024 2048 4096; do
            echo "=== stress kernel=$KERNEL max-tokens=$TOKENS port=$PORT ==="
            NODE2_CMD=(
              $CT exec -e MORI_INTERNODE_TIMEOUT=600 $CONTAINER bash $SCRIPT
              --rank 1 --master-addr $NODE1_HOST --master-port $PORT
              --ifname $NODE2_IFNAME
              --cmd stress --kernel-type $KERNEL --max-tokens $TOKENS
            )
            ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
            NODE2_PID=$!
            $CT exec -e MORI_INTERNODE_TIMEOUT=600 $CONTAINER bash $SCRIPT \
              --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
              --ifname $NODE1_IFNAME \
              --cmd stress --kernel-type $KERNEL --max-tokens $TOKENS
            wait $NODE2_PID
            sleep 1
            PORT=$((PORT + 1))
          done
        done

    - name: MORI-EP internode async kernel test
      run: |
        SCRIPT="$GITHUB_WORKSPACE/tools/run_internode_test.sh"
        PORT=29180
        for TOKENS in 64 128; do
          echo "=== async_ll test max-tokens=$TOKENS port=$PORT ==="
          NODE2_CMD=(
            $CT exec
            -e GPU_PER_NODE=8 -e MORI_ENABLE_SDMA=1 -e MORI_INTERNODE_TIMEOUT=600
            $CONTAINER bash $SCRIPT
            --rank 1 --master-addr $NODE1_HOST --master-port $PORT
            --ifname $NODE2_IFNAME
            --cmd test --kernel-type async_ll
            --quant-type none --dtype bf16 --max-tokens $TOKENS
          )
          ssh $SSH_OPTS -p $NODE2_PORT $(whoami)@$NODE2_HOST "${NODE2_CMD[*]}" &
          NODE2_PID=$!
          $CT exec \
            -e GPU_PER_NODE=8 \
            -e MORI_ENABLE_SDMA=1 \
            -e MORI_INTERNODE_TIMEOUT=600 \
            $CONTAINER bash $SCRIPT \
            --rank 0 --master-addr $NODE1_HOST --master-port $PORT \
            --ifname $NODE1_IFNAME \
            --cmd test --kernel-type async_ll \
            --quant-type none --dtype bf16 --max-tokens $TOKENS
          wait $NODE2_PID
          sleep 1
          PORT=$((PORT + 1))
        done

    # Always runs; best-effort container removal and workspace chown.
    - name: Cleanup node1
      if: always()
      run: |
        $CT rm -f $CONTAINER || true
        if $CT image inspect $IMAGE &>/dev/null; then
          $CT run --rm -v $GITHUB_WORKSPACE:$GITHUB_WORKSPACE $IMAGE \
            chown -R $(id -u):$(id -g) $GITHUB_WORKSPACE 2>/dev/null || true
        fi

    # Uses $SSH_OPTS like every other ssh call in this job.
    - name: Cleanup node2
      if: always()
      run: |
        ssh $SSH_OPTS \
          -p $NODE2_PORT $(whoami)@$NODE2_HOST \
          "$CT rm -f $CONTAINER || true"
# Opt-in UMBP single-node hicache smoke test against an SGLang checkout.
# Runs only for manual (workflow_dispatch) runs with the
# run_umbp_single_node_hicache input enabled.
umbp-single-node-hicache:
  name: mori umbp single-node hicache smoke test
  needs: build
  # The `inputs` context keeps boolean inputs as real booleans, so the value
  # is tested directly. Comparing it with the string 'true' is always false
  # (both sides are coerced to numbers: 1 vs NaN), which disabled this job.
  if: github.event_name == 'workflow_dispatch' && inputs.run_umbp_single_node_hicache
  runs-on: [self-hosted, MI355X-AINIC-TW]
  timeout-minutes: 180
  steps:
    - name: Checkout MoRI
      uses: actions/checkout@v4
      with:
        submodules: true

    # SGLang is checked out into ./sglang at the repo/ref given as inputs.
    - name: Checkout SGLang
      uses: actions/checkout@v4
      with:
        repository: ${{ inputs.sglang_repo }}
        ref: ${{ inputs.sglang_ref }}
        path: sglang
        fetch-depth: 1

    - name: Ensure hicache script is executable
      run: chmod +x src/umbp/scripts/run_umbp_single_node_hicache.sh

    # Configuration is passed to the script through environment variables.
    # String-valued flags are quoted so they stay strings.
    - name: Run hicache smoke test
      env:
        SGLANG_REPO: ${{ github.workspace }}/sglang
        MORI_REPO: ${{ github.workspace }}
        RESULTS_DIR: ${{ github.workspace }}/umbp_single_node_results
        USE_DUMMY_WEIGHTS: "true"
        RUN_GSM8K: "false"
        MODEL_PATH: ${{ github.workspace }}/umbp_dummy_model
        ENABLE_DP: ${{ inputs.enable_dp }}
        START_UMBP_MASTER: "false"
      run: |
        set -euo pipefail
        mkdir -p "$RESULTS_DIR"
        ./src/umbp/scripts/run_umbp_single_node_hicache.sh

    # Uploaded even when the test fails; a missing directory is a warning.
    - name: Upload hicache artifacts
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: umbp-single-node-hicache-${{ github.run_id }}
        path: |
          ${{ github.workspace }}/umbp_single_node_results
        if-no-files-found: warn

    # Best-effort removal of the container named umbp-single-node.
    - name: Cleanup hicache container
      if: always()
      run: docker rm -f umbp-single-node >/dev/null 2>&1 || true