doublewordai · fergusfinn · Jun 15, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/p2p/Makefile b/p2p/Makefile
@@ -1,19 +1,27 @@
 # Makefile for UCCL P2P Engine nanobind project (stable ABI)
 #
 # Single binary, runtime transport selection via UCCL_P2P_TRANSPORT env var.
-# libibverbs, libnccl, and libefa are loaded dynamically at runtime via
-# dlopen/dlsym (see *_dl.h). No transport libraries are linked.
+# libibverbs, libnccl, libefa, and (in USE_CXI=1 builds) libfabric are loaded
+# dynamically at runtime via dlopen/dlsym (see *_dl.h); none of them is linked.
 #
-# Runtime: UCCL_P2P_TRANSPORT=rdma|nccl|efa (default: rdma)
+# Runtime: UCCL_P2P_TRANSPORT=rdma|nccl|efa|cxi (default: rdma; cxi requires a USE_CXI=1 build)
 
 # DietGPU optional integration
 USE_DIETGPU ?= $(shell echo $${USE_DIETGPU:-0})
+USE_CXI ?= $(shell echo $${USE_CXI:-0})
 
 # Compiler and flags
 CUDA_HOME ?= /usr/local/cuda
 CUDA_INC  := $(CUDA_HOME)/include
 CUDA_LIB  := $(CUDA_HOME)/lib64
 EFA_HOME ?= /opt/amazon/efa
+LIBFABRIC_HOME ?= /opt/libfabric
+
+CXI_CXXFLAGS :=
+CXI_LDFLAGS :=
+ifeq ($(USE_CXI),1)  # opt-in CXI build: define USE_CXI, add the cxi/ and libfabric include paths
+  CXI_CXXFLAGS := -DUSE_CXI -I./cxi -I$(LIBFABRIC_HOME)/include
+endif  # CXI_LDFLAGS stays empty: libfabric is resolved at runtime via dlopen/dlsym
 
 # Building with the following settings:
 # NOTE: -libverbs, -lnccl, -lefa are NOT linked at compile time.
@@ -27,6 +35,7 @@ CXXFLAGS := -O3 -shared -std=c++17 -fPIC \
 	-MMD -MP \
 	-Wno-pointer-arith -Wno-sign-compare -Wno-unused-variable \
 	-Wl,-rpath=/usr/lib/x86_64-linux-gnu
+CXXFLAGS += $(CXI_CXXFLAGS)
 
 # ---------------- DietGPU ----------------
 DIETGPU_ROOT      := ../thirdparty/dietgpu
@@ -75,9 +84,9 @@ PREFIX ?= /usr/local
 LIBDIR ?= $(PREFIX)/lib
 INCDIR ?= $(PREFIX)/include
 
-# NO transport libraries linked (-libverbs, -lnccl, -lefa).
-# All ibv_* / nccl* / efadv_* symbols are provided by the dlsym wrappers
-# in rdma/ibverbs_dl.cc, nccl/nccl_dl.cc, rdma/efadv_dl.cc.
+# No transport libraries are linked (-libverbs, -lnccl, -lefa, -lfabric).
+# All ibv_* / nccl* / efadv_* / fi_* symbols are provided by dlsym wrappers in
+# rdma/ibverbs_dl.cc, nccl/nccl_dl.cc, rdma/efadv_dl.cc, and (USE_CXI=1 only) cxi/fabric_dl.cc.
 # The --wrap flags redirect compat-layer calls from verbs.h to our wrappers.
 IBV_WRAP_FLAGS := -Wl,--wrap=ibv_get_device_list \
                   -Wl,--wrap=ibv_query_port \
@@ -88,7 +97,7 @@ IBV_WRAP_FLAGS := -Wl,--wrap=ibv_get_device_list \
                   -Wl,--wrap=ibv_qp_to_qp_ex
 LDFLAGS = -L$(CUDA_LIB) -lcudart -lcuda -Wl,-rpath,$(CUDA_LIB) \
           -lz -lelf -lpthread -ldl -Wl,-Bsymbolic-functions \
-          $(IBV_WRAP_FLAGS) $(DIETGPU_LIBS)
+          $(IBV_WRAP_FLAGS) $(DIETGPU_LIBS) $(CXI_LDFLAGS)
 
 # Target and source files - always compile ALL sources
 P2P_PYTHON_EXT := p2p$(PYEXT)
@@ -115,6 +124,9 @@ P2P_IMPL_SOURCES := \
 	util/seq_num.cc \
 	rdma/providers/efa_data_channel_impl.cc \
 	rdma/providers/ib_data_channel_impl.cc
+ifeq ($(USE_CXI),1)  # CXI endpoint and the libfabric dlsym wrapper are compiled only when opted in
+  P2P_IMPL_SOURCES += cxi/cxi_endpoint.cc cxi/fabric_dl.cc
+endif
 P2P_IMPL_OBJECTS := $(P2P_IMPL_SOURCES:.cc=.o)
 SOURCES := engine.cc engine_api.cc nccl/nccl_endpoint.cc $(P2P_IMPL_SOURCES)
 CORE_OBJECT := engine.o nccl/nccl_endpoint.o $(P2P_IMPL_OBJECTS) $(DL_OBJECTS)

diff --git a/p2p/README.md b/p2p/README.md
@@ -64,6 +64,12 @@ To enable AWS EFA support, you can do the same as above, and specify `UCCL_P2P_T
 
 To enable GCP TCPX support, you can refer to [NIXL_plugin_readme.md](./NIXL_plugin_readme.md).
 
+To enable HPE Slingshot/CXI support, build with `USE_CXI=1` (libfabric headers are taken from `LIBFABRIC_HOME`, which defaults to `/opt/libfabric`) and select the `cxi` transport at runtime:
+```bash
+USE_CXI=1 LIBFABRIC_HOME=/path/to/libfabric make -j install
+UCCL_P2P_TRANSPORT=cxi UCCL_P2P_DISABLE_IPC=1 torchrun ...
+```
+
 To build with DietGPU float compression support, you can:
 ```bash
 USE_DIETGPU=1 make -j install
@@ -92,7 +98,7 @@ torchrun --nnodes=2 --nproc_per_node=1 --node-rank=1 --master_addr=<IP addr> ben
 Notes: 
 * You may consider exporting `GLOO_SOCKET_IFNAME=xxx NCCL_SOCKET_IFNAME=xxx` if triggering Gloo connectFullMesh failure.
 * You may consider exporting `UCCL_P2P_RDMA_GID_INDEX` if your cluster requires it for NCCL to run (usually 1, or 3 in some testbed).
-* You can specify `UCCL_P2P_TRANSPORT=ib|efa|nccl|tcp|tcpx` at runtime to choose different network backends. The default is `ib` that works for NVIDIA, Broadcom, AMD, and Intel RDMA NICs. 
+* You can specify `UCCL_P2P_TRANSPORT=ib|efa|nccl|tcp|tcpx|cxi` at runtime to choose different network backends. The default is `ib` that works for NVIDIA, Broadcom, AMD, and Intel RDMA NICs.
 * **You must first import `torch` before importing `uccl.p2p` for AMD GPUs**, otherwise, `RuntimeError: No HIP GPUs are available` will occur. We guess this is because torch does some extra init for AMD GPUs, in order for Pybind-C++ code to work. 
 * One-sided network write is the default in `benchmark_uccl.py`; use `--mode read` for RDMA read.
 * To benchmark one-sided IPC write (GPU-to-GPU or CPU-to-GPU), `torchrun --nproc_per_node=2 benchmarks/benchmark_uccl.py --write-ipc`. Use `--device cpu --pinned` for CPU source buffers.
@@ -107,7 +113,15 @@ Notes:
 | UCCL_P2P_RDMA_SL | Service level in RDMA network | 8/3 (EFA/IB) |
 | UCCL_P2P_RDMA_TC | Traffic class in RDMA network | 104 (IB) |
 | UCCL_P2P_RDMA_DEV | RDMA devices forced to use (instead of auto-selecting based on PCIe affinity) | none (eg, `irdma-mkp0,irdma-mkp1`) |
-| UCCL_P2P_TRANSPORT | Network backend to use at runtime | ib (others: efa/nccl/tcp/tcpx) |
+| UCCL_P2P_TRANSPORT | Network backend to use at runtime | ib (others: efa/nccl/tcp/tcpx/cxi) |
+| UCCL_CXI_DOMAIN | CXI/libfabric domain to use when `UCCL_P2P_TRANSPORT=cxi` | auto from GPU index, eg `cxi0` |
+| UCCL_CXI_DEVICE_INDEX | CXI device index used for automatic domain selection | GPU index modulo 4 |
+| UCCL_CXI_THREADING | libfabric threading hint for the CXI domain | endpoint |
+| UCCL_CXI_TX_QUEUE_SIZE | CXI transmit queue size | 4096 |
+| UCCL_CXI_RX_QUEUE_SIZE | CXI receive queue size | 4096 |
+| UCCL_CXI_CQ_SIZE | CXI completion queue size | 8192 |
+| UCCL_P2P_MAX_INFLIGHT_OPS | Maximum number of one-sided operations in flight; the CXI default is lower than that of the other transports | 32 for CXI, otherwise internal maximum |
+| UCCL_LIBFABRIC_SO | Override the path of the libfabric shared library loaded by the CXI dlsym wrapper | auto-detect `libfabric.so` / `libfabric.so.1` |
 | UCCL_P2P_COMPRESS_STRATEGY | DietGPU compression strategy (requires `USE_DIETGPU=1` build) | none |
 | UCCL_RDMA_ADAPTIVE_SLEEP | Enable adaptive sleeping on proxy threads, by putting the proxy threads into a sleeping state if there have been no new work requests / RDMA completion events after 120s. | null |