Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/uccl-build-test-gb10.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,12 @@ jobs:

./build.sh cu13 ep 3.13 --install 2>&1 | tee build.log

pushd /tmp
if ! python -c 'import torch; import uccl.ep'; then
echo 'Import of torch and uccl.ep failed.'
exit 1
fi
popd

echo 'Build and Verification Successful!'
EOF
Expand Down
5 changes: 4 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
include uccl/__init__.py
include uccl/lib/*.so
include uccl/*.so
include uccl/*.so
recursive-include ep/python/uccl_ep *.py *.so
recursive-include ep/deep_ep_wrapper/deep_ep *.py
include build_native.sh
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,15 @@ git clone https://github.com/uccl-project/uccl.git && cd uccl
# Eg, bash build.sh cu12 ep --install
bash build.sh [cu12|cu13|roc7|roc6|therock] [all|ccl_rdma|ccl_efa|p2p|ep] \
[py_version] [rocm_index_url] --install

# Install from source. Use the torch package from your current environment so
# CUDA/ROCm backend detection matches the target platform.
pip install nanobind
pip install . --no-build-isolation
# or (legacy, may be removed in newer setuptools):
python3 setup.py install
# or, for development, install in editable mode instead:
pip install -e . --no-build-isolation
```
> Note:
> - By default, `build.sh cu12` targets CUDA 12.8 and `build.sh roc7` targets ROCm 7.1, but you can also specify `cu13|roc6` to target CUDA 13.0 or ROCm 6.4.
Expand Down
2 changes: 2 additions & 0 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -343,12 +343,14 @@ if [[ "${SKIP_DOCKER_BUILD:-0}" != "1" ]]; then

if [[ "$ARCH" == "aarch64" ]]; then
${CONTAINER_ENGINE} build \
--network=host \
--platform=linux/arm64 \
$BUILD_ARGS \
-t "$IMAGE_NAME" \
-f "$DOCKERFILE" .
else
${CONTAINER_ENGINE} build \
--network=host \
$BUILD_ARGS \
-t "$IMAGE_NAME" \
-f "$DOCKERFILE" .
Expand Down
266 changes: 13 additions & 253 deletions build_inner.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
# Invoked by build.sh via docker/podman/apptainer; not intended for direct
# execution on the host.
#
# Responsibilities (packaging only):
# * Drive ``python -m build`` to invoke setup.py -> ShellBuildExtension,
# which delegates the actual native compilation to ``build_native.sh``.
# * Repair / retag / rename the resulting wheel via auditwheel.
#
# Native compilation lives in ``build_native.sh``; this script does not call
# ``make`` directly.
#
# Environment variables consumed (set by build.sh before container launch):
#
# Required:
Expand All @@ -21,7 +29,7 @@
# UCCL_RETAG_TO_HOST_GLIBC Retag wheel to host glibc version (default "0")
# UCCL_LOCAL_VERSION Local version suffix appended to wheel filename (PEP 440)
#
# Build feature flags:
# Build feature flags (forwarded to build_native.sh):
# USE_DIETGPU Enable DietGPU compression (default "0")
# USE_INTEL_RDMA_NIC Enable Intel RDMA NIC / irdma driver (default "0")
# USE_DMABUF Enable EP DMA-BUF GPU memory registration (default "0")
Expand All @@ -31,228 +39,6 @@

set -euo pipefail

########################################################
# Build helper functions
########################################################

# Rename cpython-versioned .so files to .abi3.so for stable ABI compatibility.
# Only applies on Python >= 3.12 where nanobind stable ABI is enabled.
#
# Arguments:
#   $1 - directory whose *.cpython-*.so files are renamed in place
# Outputs:
#   One "Renaming <old> -> <new>" line per renamed file on stdout, or a
#   single notice when the interpreter is older than 3.12.
rename_to_abi3() {
  local dir="$1"
  local py_stable_abi_ok
  py_stable_abi_ok=$(python3 -c "import sys; print(1 if sys.version_info >= (3, 12) else 0)")
  if [[ "$py_stable_abi_ok" != "1" ]]; then
    echo "Python < 3.12 detected, skipping abi3 rename (nanobind stable ABI not supported)"
    return
  fi

  local f newname
  for f in "$dir"/*.cpython-*.so; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -f "$f" ]] || continue
    # Strip the trailing ".cpython-<tag>.so" whatever the shape of <tag>.
    # The glob above guarantees the suffix is in the file name, so the
    # directory part of the path is never rewritten.
    newname="${f%.cpython-*.so}.abi3.so"
    echo "Renaming $(basename "$f") -> $(basename "$newname")"
    mv -- "$f" "$newname"
  done
}

build_rccl_nccl_header() {
  # ROCm, unlike CUDA, does not ship nccl.h, so rccl's cmake configure step is
  # run to obtain it. Nothing to do when the header already exists.
  local nccl_header="thirdparty/rccl/build/release/include/nccl.h"
  if [[ -f "$nccl_header" ]]; then
    return 0
  fi

  local -a configure_args=(
    -B build/release
    -S .
    -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF
  )

  cd thirdparty/rccl
  # Only the header is wanted, not the whole library: the configure step is
  # best-effort, with its output discarded and its failure ignored.
  CXX=/opt/rocm/bin/hipcc cmake "${configure_args[@]}" >/dev/null 2>&1 || true
  cd ../..
}

# Build the collective RDMA network plugin for TARGET and copy it to uccl/lib/.
#
# Globals:
#   USE_INTEL_RDMA_NIC  read; forwarded to make for cu* targets (default "0")
#   VIRTUAL_ENV         read; therock only, used to derive CONDA_LIB_HOME
#   TARGET_SO           written; repo-relative path of the built plugin
# Arguments:
#   $1 - target: cu*, roc6, roc7 or therock
#   $2 - architecture; "aarch64" skips the ROCm and TheRock builds
#   $3 - EFA flag; accepted but not used by this builder
# Returns:
#   0 on success or when the build is skipped; non-zero when a build step
#   fails or the target is not recognised.
build_ccl_rdma() {
  local TARGET="$1"
  local ARCH="$2"
  local IS_EFA="$3"

  set -euo pipefail
  echo "[container] build_ccl_rdma Target: $TARGET"

  if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then
    echo "[container] Building with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)"
  fi

  # Each build runs in a subshell followed by an explicit `|| ... return 1`.
  # errexit ignores failures of non-final commands in an `&&` list, so a bare
  # `cd dir && make clean && make && cd ../..` would carry on after a failed
  # make with the working directory still inside collective/rdma.
  if [[ "$TARGET" == cu* ]]; then
    (
      cd collective/rdma \
        && make clean \
        && make -j"$(nproc)" USE_INTEL_RDMA_NIC="${USE_INTEL_RDMA_NIC:-0}"
    ) || {
      echo "ERROR: RDMA plugin build failed (collective/rdma)" >&2
      return 1
    }
    TARGET_SO=collective/rdma/libnccl-net-uccl.so
  elif [[ "$TARGET" == roc[67] ]]; then
    if [[ "$ARCH" == "aarch64" ]]; then
      echo "Skipping ROCm build on Arm64 (no ROCm toolchain)."
      return 0
    fi
    (
      cd collective/rdma \
        && make clean -f Makefile.rocm \
        && make -j"$(nproc)" -f Makefile.rocm
    ) || {
      echo "ERROR: RDMA plugin build failed (collective/rdma)" >&2
      return 1
    }
    TARGET_SO=collective/rdma/librccl-net-uccl.so
  elif [[ "$TARGET" == "therock" ]]; then
    if [[ "$ARCH" == "aarch64" ]]; then
      echo "Skipping ROCm build on Arm64 (no ROCm toolchain)."
      return 0
    fi
    # Unlike CUDA, ROCM does not include nccl.h. So we need to build rccl to get nccl.h.
    if [[ ! -f "thirdparty/rccl/build/release/include/nccl.h" ]]; then
      cd thirdparty/rccl
      # Just to get nccl.h, not the whole library; best-effort, output discarded.
      CXX=hipcc cmake -B build/release -S . -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF \
        -DCMAKE_PREFIX_PATH="$(rocm-sdk path --cmake)" \
        -DROCM_PATH="$(rocm-sdk path --root)" \
        -DHIP_PLATFORM=amd >/dev/null 2>&1 || true
      cd ../..
    fi
    (
      cd collective/rdma \
        && make clean -f Makefile.therock \
        && make -j"$(nproc)" -f Makefile.therock \
          HIP_HOME="$(rocm-sdk path --root)" CONDA_LIB_HOME="$VIRTUAL_ENV/lib"
    ) || {
      echo "ERROR: RDMA plugin build failed (collective/rdma)" >&2
      return 1
    }
    TARGET_SO=collective/rdma/librccl-net-uccl.so
  else
    # Without this branch TARGET_SO would be unset when it is expanded below.
    echo "ERROR: build_ccl_rdma: unsupported target '$TARGET'" >&2
    return 1
  fi

  echo "[container] Copying RDMA .so to uccl/lib/"
  mkdir -p uccl/lib
  cp -- "${TARGET_SO}" uccl/lib/
}

# Build the EFA network plugin plus the custom NCCL it needs, then copy both
# shared objects into uccl/lib/.
#
# Globals:
#   USE_INTEL_RDMA_NIC  read; forwarded to both make invocations (default "0")
# Arguments:
#   $1 - target; roc6, roc7 and therock are skipped
#   $2 - architecture; "aarch64" is skipped
#   $3 - EFA flag; accepted but not used by this builder
# Returns:
#   0 on success or when the build is skipped; non-zero when a build step fails.
build_ccl_efa() {
  local TARGET="$1"
  local ARCH="$2"
  local IS_EFA="$3"

  set -euo pipefail
  echo "[container] build_ccl_efa Target: $TARGET"

  if [[ "$ARCH" == "aarch64" || "$TARGET" == roc[67] || "$TARGET" == "therock" ]]; then
    echo "Skipping EFA build on Arm64 (no EFA installer) or ROCm (no CUDA)."
    return 0
  fi

  if [[ "${USE_INTEL_RDMA_NIC:-0}" == "1" ]]; then
    echo "[container] Building with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)"
  fi

  # Subshell plus explicit failure check: errexit does not fire for a
  # non-final command of an `&&` list, so a failed make in a bare chain would
  # go unnoticed and leave the working directory inside collective/efa.
  (
    cd collective/efa \
      && make clean \
      && make -j"$(nproc)" USE_INTEL_RDMA_NIC="${USE_INTEL_RDMA_NIC:-0}"
  ) || {
    echo "ERROR: EFA plugin build failed (collective/efa)" >&2
    return 1
  }

  # EFA requires a custom NCCL.
  (
    cd thirdparty/nccl-sg \
      && make src.build -j"$(nproc)" \
        NVCC_GENCODE="-gencode=arch=compute_90,code=sm_90" \
        USE_INTEL_RDMA_NIC="${USE_INTEL_RDMA_NIC:-0}"
  ) || {
    echo "ERROR: custom NCCL build failed (thirdparty/nccl-sg)" >&2
    return 1
  }

  echo "[container] Copying EFA .so to uccl/lib/"
  mkdir -p uccl/lib
  cp collective/efa/libnccl-net-efa.so uccl/lib/
  cp thirdparty/nccl-sg/build/lib/libnccl.so uccl/lib/libnccl-efa.so
}

# Build the P2P engine (and, when USE_DIETGPU=1, the DietGPU float library)
# and copy the results into uccl/ and uccl/lib/.
#
# Globals:
#   USE_DIETGPU           read; "1" enables the DietGPU build (default "0")
#   TORCH_CUDA_ARCH_LIST  read; first entry selects the CUDA arch (default
#                         9.0); passed to make as GPU_ARCH for non-CUDA targets
#   VIRTUAL_ENV           read; therock only, used to derive CONDA_LIB_HOME
#   CUDA_GPU_ARCH         written on the CUDA DietGPU path
# Arguments:
#   $1 - target: cu*, roc6, roc7 or therock
#   $2 - architecture; not used by this builder
#   $3 - EFA flag; not used by this builder
# Returns:
#   0 on success; non-zero when a build or copy step fails.
build_p2p() {
  local TARGET="$1"
  local ARCH="$2"
  local IS_EFA="$3"

  set -euo pipefail
  echo "[container] build_p2p Target: $TARGET"

  # Create the output directories before anything is copied. The DietGPU
  # library is copied first, and `cp file uccl/lib` on a fresh tree would
  # otherwise fail or create a regular file named "lib".
  mkdir -p uccl/lib

  # Builds run in subshells with an explicit failure check: errexit ignores a
  # failing non-final command in an `&&` list, so a failed `make clean` would
  # otherwise skip the build silently and stale outputs would be copied.
  if [[ "${USE_DIETGPU:-0}" == "1" ]]; then
    if [[ "$TARGET" == cu* ]]; then
      CUDA_GPU_ARCH="sm_$(echo "${TORCH_CUDA_ARCH_LIST:-9.0}" | awk '{print $1}' | sed 's/+PTX//; s/\.//')"
      echo "Building dietgpu float for CUDA: $CUDA_GPU_ARCH"
      (
        cd thirdparty/dietgpu/dietgpu/float \
          && make clean -f Makefile.cuda \
          && make -j"$(nproc)" -f Makefile.cuda GPU_ARCH="$CUDA_GPU_ARCH"
      ) || {
        echo "ERROR: dietgpu build failed (thirdparty/dietgpu)" >&2
        return 1
      }
    else
      (
        cd thirdparty/dietgpu \
          && rm -rf build/ \
          && python3 setup.py build
      ) || {
        echo "ERROR: dietgpu build failed (thirdparty/dietgpu)" >&2
        return 1
      }
      # Non-CUDA builds have no default arch; abort with a clear message
      # when the variable is unset.
      echo "${TORCH_CUDA_ARCH_LIST?must be set for non-CUDA DietGPU builds}"
      (
        cd thirdparty/dietgpu/dietgpu/float \
          && make clean -f Makefile.rocm \
          && make -j"$(nproc)" -f Makefile.rocm GPU_ARCH="$TORCH_CUDA_ARCH_LIST"
      ) || {
        echo "ERROR: dietgpu build failed (thirdparty/dietgpu)" >&2
        return 1
      }
    fi
    cp thirdparty/dietgpu/dietgpu/float/libdietgpu_float.so uccl/lib/
  fi

  if [[ "$TARGET" == cu* ]]; then
    (
      cd p2p \
        && make clean \
        && make -j"$(nproc)"
    ) || {
      echo "ERROR: p2p build failed (p2p)" >&2
      return 1
    }
  elif [[ "$TARGET" == roc[67] ]]; then
    (
      cd p2p \
        && make clean -f Makefile.rocm \
        && make -j"$(nproc)" -f Makefile.rocm
    ) || {
      echo "ERROR: p2p build failed (p2p)" >&2
      return 1
    }
  elif [[ "$TARGET" == "therock" ]]; then
    (
      cd p2p \
        && make clean -f Makefile.therock \
        && make -j"$(nproc)" -f Makefile.therock \
          HIP_HOME="$(rocm-sdk path --root)" CONDA_LIB_HOME="$VIRTUAL_ENV/lib"
    ) || {
      echo "ERROR: p2p build failed (p2p)" >&2
      return 1
    }
  fi

  echo "[container] Copying P2P .so, collective.py and utils.py to uccl/"
  cp p2p/libuccl_p2p.so uccl/lib/
  cp p2p/p2p.*.so uccl/
  cp p2p/collective.py uccl/
  cp p2p/utils.py uccl/
  rename_to_abi3 uccl
}

# Build the EP extension through ep/setup.py and copy the resulting .so files
# into uccl/. roc6 is rejected; roc7, cu* and therock are built; any other
# target goes straight to the abi3 rename.
#
# Globals:
#   USE_INTEL_RDMA_NIC, USE_DMABUF  read; forwarded to setup.py (default "0")
#   extra_env, ROCM_ROOT            written while building
# Arguments:
#   $1 - target, $2 - architecture (unused), $3 - EFA flag (unused)
build_ep() {
  local TARGET="$1"
  local ARCH="$2"
  local IS_EFA="$3"

  set -euo pipefail
  echo "[container] build_ep Target: $TARGET"

  case "${USE_INTEL_RDMA_NIC:-0}" in
    1) echo "[container] Building EP with Intel RDMA NIC support (USE_INTEL_RDMA_NIC=1)" ;;
  esac
  case "${USE_DMABUF:-0}" in
    1) echo "[container] Building EP with DMA-BUF GPU memory registration (USE_DMABUF=1)" ;;
  esac

  case "$TARGET" in
    roc6)
      echo "ERROR: EP requires roc7 (ROCm 7) for HIP code transformation; roc6 is not supported." >&2
      exit 1
      ;;
    roc7 | cu* | therock)
      cd ep
      # This may be needed if you traverse through different git commits
      # make clean && rm -r build || true
      extra_env=()
      if [[ "$TARGET" == "therock" ]]; then
        # TheRock's ROCm comes from a pip-installed rocm-sdk wheel. Hand its
        # root to ep/setup.py through HIP_HOME/ROCM_HOME/ROCM_PATH so hipcc
        # can find headers and libraries. The IBGDA (GPU-driven RDMA) path in
        # ep/src/internode_ll.cu is already gated by __HIP_PLATFORM_AMD__
        # guards, so the AMD build needs no extra flag.
        ROCM_ROOT="$(rocm-sdk path --root)"
        extra_env=(
          "HIP_HOME=${ROCM_ROOT}"
          "ROCM_HOME=${ROCM_ROOT}"
          "ROCM_PATH=${ROCM_ROOT}"
        )
      fi
      env "${extra_env[@]}" \
        USE_INTEL_RDMA_NIC=${USE_INTEL_RDMA_NIC:-0} USE_DMABUF=${USE_DMABUF:-0} \
        python3 setup.py build
      cd ..
      echo "[container] Copying EP .so to uccl/"
      mkdir -p uccl/lib
      cp ep/build/**/*.so uccl/
      ;;
  esac
  rename_to_abi3 uccl
}

# Build the experimental ukernel library and copy its .so files to uccl/lib/.
#
# Arguments:
#   $1 - target; cu* builds with Makefile, roc6/roc7 with Makefile.rocm, any
#        other target builds nothing and only copies what is already there
#   $2 - architecture; not used by this builder
#   $3 - EFA flag; not used by this builder
# Returns:
#   0 on success; non-zero when a build or copy step fails.
build_ukernel() {
  local TARGET="$1"
  local ARCH="$2"
  local IS_EFA="$3"

  set -euo pipefail
  echo "[container] build_ukernel Target: $TARGET"

  local makefile=""
  if [[ "$TARGET" == cu* ]]; then
    makefile="Makefile"
  elif [[ "$TARGET" == roc[67] ]]; then
    makefile="Makefile.rocm"
  fi

  if [[ -n "$makefile" ]]; then
    # Subshell plus explicit failure check: errexit ignores a failing
    # non-final command of an `&&` list, so a failed `make clean` would
    # otherwise skip the build silently and stale .so files would be copied.
    (
      cd experimental/ukernel \
        && make clean -f "$makefile" \
        && make -j"$(nproc)" -f "$makefile"
    ) || {
      echo "ERROR: ukernel build failed (experimental/ukernel)" >&2
      return 1
    }
  fi

  echo "[container] Copying ukernel .so to uccl/"
  mkdir -p uccl/lib # mkdir anyway
  cp experimental/ukernel/*ukernel*.so uccl/lib/
}

########################################################
# Main build logic
########################################################
Expand All @@ -273,33 +59,6 @@ if [[ "$TARGET" == "therock" ]]; then
pip3 install --no-cache-dir rocm[libraries,devel] torch --index-url ${ROCM_IDX_URL}
fi

# roc6/roc7 targets first run the rccl configure step that provides nccl.h.
case "$TARGET" in
  roc[67]) build_rccl_nccl_header ;;
esac

# Run the builder(s) selected by BUILD_TYPE; unknown values build nothing.
case "$BUILD_TYPE" in
  ccl_rdma)
    build_ccl_rdma "$TARGET" "$ARCH" "$IS_EFA"
    ;;
  ccl_efa)
    build_ccl_efa "$TARGET" "$ARCH" "$IS_EFA"
    ;;
  p2p)
    build_p2p "$TARGET" "$ARCH" "$IS_EFA"
    ;;
  ep)
    build_ep "$TARGET" "$ARCH" "$IS_EFA"
    ;;
  p2p_ep)
    build_p2p "$TARGET" "$ARCH" "$IS_EFA"
    build_ep "$TARGET" "$ARCH" "$IS_EFA"
    ;;
  ukernel)
    build_ukernel "$TARGET" "$ARCH" "$IS_EFA"
    ;;
  all)
    # "all" picks exactly one collective plugin: EFA when IS_EFA is
    # non-empty, RDMA otherwise.
    if [[ -z "$IS_EFA" ]]; then
      build_ccl_rdma "$TARGET" "$ARCH" "$IS_EFA"
    else
      build_ccl_efa "$TARGET" "$ARCH" "$IS_EFA"
    fi
    build_p2p "$TARGET" "$ARCH" "$IS_EFA"
    build_ep "$TARGET" "$ARCH" "$IS_EFA"
    ;;
esac

if [[ "$TARGET" == "therock" ]]; then
echo "
def initialize():
Expand Down Expand Up @@ -328,10 +87,11 @@ def initialize():
export PIP_EXTRA_INDEX_URL=${ROCM_IDX_URL}
fi

ls -lh uccl/
ls -lh uccl/lib/
python3 -m build
# All native build logic lives in ``build_native.sh``, driven by setup.py's
# ShellBuildExtension. ``--no-isolation`` reuses the container's setuptools/wheel.
python3 -m build --wheel --no-isolation

# Restore the original setup.py if we patched it.
if [[ "$TARGET" == "therock" ]]; then
mv ${BACKUP_FN} setup.py
fi
Expand Down
Loading
Loading