Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 29 additions & 110 deletions .ci/jenkins/lib/test-dl-matrix.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
---
#
# DLCluster GPU Test Matrix Configuration for dlcluster.nvidia.com
# Lyris GPU Test Matrix Configuration
#
# Key Components:
# - Job Configuration: Defines timeout, failure behavior, and server resources
# - Docker Images: Specifies the container images used for different build stages
# - Matrix Axes: Defines build variations for dlcluster testing (multi-node, multi-GPU)
# - Run Steps: Sequential steps for running dlcluster GPU tests
# - Matrix Axes: Defines build variations for Lyris testing (multi-node, multi-GPU)
# - Run Steps: Sequential steps - build image, then trigger Lyris pipeline and wait
#
# When Modified:
# - Adding/removing Docker images: Affects available test environments
Expand Down Expand Up @@ -38,23 +38,20 @@ kubernetes:

# Jenkins credentials bound into the job environment for the steps below.
credentials:
# Registry login; REPO_USER / REPO_PASS are passed to `podman push --creds`.
- {credentialsId: 'svc-nixl-new-artifactory-token', usernameVariable: 'REPO_USER', passwordVariable: 'REPO_PASS'}
# TODO: create these Jenkins credentials before enabling the Lyris trigger
# The Lyris step hands these to .ci/scripts/trigger_and_wait.sh as
# TRIGGER_TOKEN (pipeline trigger) and GITLAB_TOKEN (PRIVATE-TOKEN API header).
- {credentialsId: 'lyris-trigger-token', variable: 'LYRIS_TRIGGER_TOKEN'}
- {credentialsId: 'lyris-api-token', variable: 'LYRIS_API_TOKEN'}

# Variables referenced as ${NAME} by the steps in this file.
env:
# Registry path prefix used when composing the PR test-image reference.
ARTIFACTORY_PATH: /sw-nbu-swx-nixl-docker-local/ci
# Passed as the single argument to each .gitlab/test_*.sh script.
NIXL_INSTALL_DIR: /opt/nixl
NIXL_BUILD_DIR: nixl_build
# SLURM_*, SSH_CREDENTIALS_ID and JOB_ID_FILE_ROOT are referenced only by the
# slurmCI steps (allocation / run / stopAllForBuild) visible in this file.
SLURM_NODES: 1
SLURM_PARTITION: gb200nvl72_ci
SLURM_HEAD_NODE: dlcluster.nvidia.com
SLURM_HEAD_USER: svc-nixl
SLURM_ACCOUNT: 'oberon-gb-ci'
SLURM_JOB_TIMEOUT: '01:30:00'
SLURM_IMMEDIATE_TIMEOUT: 3600
SSH_CREDENTIALS_ID: 'svc-nixl-ssh_key'
JOB_ID_FILE_ROOT: "/mnt/pvc/${JOB_BASE_NAME}"
# Used as the `timeout:` of the test step(s); unit is not shown here
# (presumably minutes -- confirm against the CI framework).
TEST_TIMEOUT: 50
STORAGE_DRIVER: overlay
CI_IMAGE_TAG: "20260607-1"
# Forwarded to trigger_and_wait.sh as --partition.
LYRIS_PARTITION: gb200
# Forwarded to trigger_and_wait.sh as --ref (git ref of the Lyris pipeline project).
# TODO: flip to main once precluster-poc lyris-exec is merged
LYRIS_PIPELINE_REF: feat/nixl-srun-gb200-gb300

empty_volumes:
- {mountPath: /var/lib/containers/storage, memory: false}
Expand Down Expand Up @@ -110,104 +107,26 @@ steps:
-f .ci/dockerfiles/Dockerfile.gpu-test .
podman push --creds ${REPO_USER}:${REPO_PASS} ${PR_IMAGE}

- name: Allocate DL Environment
containerSelector: "{name: 'build_helper_dl'}"
parallel: false
shell: action
module: slurmCI
run: allocation
args:
partition: "${SLURM_PARTITION}"
headNode: "${SLURM_HEAD_NODE}"
headUser: "${SLURM_HEAD_USER}"
nodes: "${SLURM_NODES}"
jobTimeout: "${SLURM_JOB_TIMEOUT}"
immediateTimeout: "${SLURM_IMMEDIATE_TIMEOUT}"
jobName: "nixl-ci-${ucx_version}-${BUILD_NUMBER}"
jobIdFile: "${JOB_ID_FILE_ROOT}/job_id_${ucx_version}_${BUILD_NUMBER}.txt"
credentialsId: "${SSH_CREDENTIALS_ID}"
extraArgs: [
"--account=${SLURM_ACCOUNT}"
]

- name: Run DL Python tests
- name: Run DL tests on Lyris
containerSelector: "{name: 'build_helper_dl'}"
timeout: "${TEST_TIMEOUT}"
parallel: false
shell: action
module: slurmCI
run: run
args:
jobIdFile: "${JOB_ID_FILE_ROOT}/job_id_${ucx_version}_${BUILD_NUMBER}.txt"
testScript: ".gitlab/test_python.sh ${NIXL_INSTALL_DIR}"
headNode: "${SLURM_HEAD_NODE}"
headUser: "${SLURM_HEAD_USER}"
dockerImage: "${registry_host}#${ARTIFACTORY_PATH}/pr/${arch}/nixl-ci-dl-gpu-test-${ucx_version}:${BUILD_NUMBER}"
credentialsId: "${SSH_CREDENTIALS_ID}"
containerName: "nixl-ci-${ucx_version}-${BUILD_NUMBER}"

- name: Run DL Rust tests
containerSelector: "{name: 'build_helper_dl'}"
timeout: "${TEST_TIMEOUT}"
parallel: false
shell: action
module: slurmCI
run: run
args:
jobIdFile: "${JOB_ID_FILE_ROOT}/job_id_${ucx_version}_${BUILD_NUMBER}.txt"
testScript: ".gitlab/test_rust.sh ${NIXL_INSTALL_DIR}"
headNode: "${SLURM_HEAD_NODE}"
headUser: "${SLURM_HEAD_USER}"
dockerImage: "${registry_host}#${ARTIFACTORY_PATH}/pr/${arch}/nixl-ci-dl-gpu-test-${ucx_version}:${BUILD_NUMBER}"
credentialsId: "${SSH_CREDENTIALS_ID}"
containerName: "nixl-ci-${ucx_version}-${BUILD_NUMBER}"

- name: Run DL CPP tests
containerSelector: "{name: 'build_helper_dl'}"
timeout: "${TEST_TIMEOUT}"
parallel: false
shell: action
module: slurmCI
run: run
args:
jobIdFile: "${JOB_ID_FILE_ROOT}/job_id_${ucx_version}_${BUILD_NUMBER}.txt"
testScript: ".gitlab/test_cpp.sh ${NIXL_INSTALL_DIR}"
headNode: "${SLURM_HEAD_NODE}"
headUser: "${SLURM_HEAD_USER}"
dockerImage: "${registry_host}#${ARTIFACTORY_PATH}/pr/${arch}/nixl-ci-dl-gpu-test-${ucx_version}:${BUILD_NUMBER}"
credentialsId: "${SSH_CREDENTIALS_ID}"
containerName: "nixl-ci-${ucx_version}-${BUILD_NUMBER}"
slurmEnv: [
'UCX_IB_REG_METHODS=rcache'
]

- name: Run DL Nixlbench tests
containerSelector: "{name: 'build_helper_dl'}"
timeout: "${TEST_TIMEOUT}"
parallel: false
shell: action
module: slurmCI
run: run
args:
jobIdFile: "${JOB_ID_FILE_ROOT}/job_id_${ucx_version}_${BUILD_NUMBER}.txt"
testScript: ".gitlab/test_nixlbench.sh ${NIXL_INSTALL_DIR}"
headNode: "${SLURM_HEAD_NODE}"
headUser: "${SLURM_HEAD_USER}"
dockerImage: "${registry_host}#${ARTIFACTORY_PATH}/pr/${arch}/nixl-ci-dl-gpu-test-${ucx_version}:${BUILD_NUMBER}"
credentialsId: "${SSH_CREDENTIALS_ID}"
containerName: "nixl-ci-${ucx_version}-${BUILD_NUMBER}"
slurmEnv: [
'HAS_GPU=false'
]

pipeline_stop:
containerSelector: "{name: 'build_helper_dl'}"
parallel: false
shell: action
module: slurmCI
run: stopAllForBuild
args:
credentialsId: "${SSH_CREDENTIALS_ID}"
headNode: "${SLURM_HEAD_NODE}"
headUser: "${SLURM_HEAD_USER}"
jobIdDir: "${JOB_ID_FILE_ROOT}"
run: |
  # Export the Lyris tokens BEFORE enabling xtrace: with `set -x` active, an
  # inline `VAR=${SECRET} cmd` prefix is echoed to the build log fully expanded.
  export TRIGGER_TOKEN="${LYRIS_TRIGGER_TOKEN}" GITLAB_TOKEN="${LYRIS_API_TOKEN}"
  set -x
  # Write test-cmds file: one entry per line as "name<TAB>command".
  # cpp: UCX_IB_REG_METHODS=rcache is inlined as a shell env-var prefix.
  # nixlbench: HAS_GPU=false is inlined as a shell env-var prefix.
  # NOTE: the original slurmCI steps set these via slurmEnv (per-run SLURM env injection).
  # In the single-session Lyris model all four tests run in one srun; the inline
  # prefix approach is equivalent but scopes the var to that command only.
  # The name and command are passed as %s arguments (never as part of the
  # printf format) so a '%' or backslash in a path cannot corrupt the entry.
  printf '%s\t%s\n' python ".gitlab/test_python.sh ${NIXL_INSTALL_DIR}" > tc.txt
  printf '%s\t%s\n' rust ".gitlab/test_rust.sh ${NIXL_INSTALL_DIR}" >> tc.txt
  printf '%s\t%s\n' cpp "UCX_IB_REG_METHODS=rcache .gitlab/test_cpp.sh ${NIXL_INSTALL_DIR}" >> tc.txt
  printf '%s\t%s\n' nixlbench "HAS_GPU=false .gitlab/test_nixlbench.sh ${NIXL_INSTALL_DIR}" >> tc.txt
  # Trigger the Lyris pipeline and block until it reaches a terminal state.
  # The script's exit status is the step's status.
  .ci/scripts/trigger_and_wait.sh \
    --ref "${LYRIS_PIPELINE_REF}" \
    --partition "${LYRIS_PARTITION}" \
    --image "${registry_host}#${ARTIFACTORY_PATH}/pr/${arch}/nixl-ci-dl-gpu-test-${ucx_version}:${BUILD_NUMBER}" \
    --test-cmds-file tc.txt \
    --retries 2
41 changes: 41 additions & 0 deletions .ci/scripts/trigger_and_wait.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env bash
# Trigger the generic lyris-exec pipeline, poll to terminal, fetch artifacts.
# Retries on infra-class outcomes (exit 90 / runner offline / queue timeout).
#
# Usage:
#   TRIGGER_TOKEN=... GITLAB_TOKEN=... trigger_and_wait.sh \
#     --ref REF --partition PART --image IMAGE --test-cmds-file FILE [--retries N]
#
# Environment:
#   TRIGGER_TOKEN    pipeline trigger token (required)
#   GITLAB_TOKEN     API token, sent as the PRIVATE-TOKEN header (required)
#   POLL_INTERVAL    seconds between status polls (default 30)
#   POLL_MAX_ERRORS  consecutive failed status polls tolerated before giving
#                    up on the pipeline (default 10)
#
# Exit codes:
#   0   summary.txt reports TEST_FAILS=0
#   1   test failures, or no usable summary.txt
#   2   bad usage
#   90  infra failure (after retries where a retry is possible)
#
# `-e` is deliberately not set: every failure below is handled explicitly.
set -uo pipefail

API="https://gitlab-master.nvidia.com/api/v4/projects/231686"
REF=""; PART=""; IMAGE=""; TCF=""; RETRIES=2
POLL_INTERVAL=${POLL_INTERVAL:-30}
POLL_MAX_ERRORS=${POLL_MAX_ERRORS:-10}

# True when $1 is a non-empty string of decimal digits.
is_uint() { case "$1" in ''|*[!0-9]*) return 1;; *) return 0;; esac; }

# Print field $1 of the JSON object on stdin; fail (no output) when the input
# is not JSON or lacks the field, e.g. an API error body.
json_field() {
  python3 -c 'import sys, json
try:
    print(json.load(sys.stdin)[sys.argv[1]])
except Exception:
    sys.exit(1)' "$1"
}

# Called on an infra-class outcome: return so the caller can `continue` into
# the next attempt while retries remain, otherwise exit 90.
retry_or_exit() {
  if [ "$attempt" -le "$RETRIES" ]; then echo "infra failure, retrying"; return 0; fi
  echo "infra failure after retries"; exit 90
}

while [ $# -gt 0 ]; do
  case "$1" in
    --ref|--partition|--image|--test-cmds-file|--retries)
      # Report a missing value as a usage error instead of tripping `set -u`.
      [ $# -ge 2 ] || { echo "missing value for $1" >&2; exit 2; }
      case "$1" in
        --ref) REF=$2;;
        --partition) PART=$2;;
        --image) IMAGE=$2;;
        --test-cmds-file) TCF=$2;;
        --retries) RETRIES=$2;;
      esac
      shift 2;;
    *) echo "bad arg $1" >&2; exit 2;;
  esac
done

: "${TRIGGER_TOKEN:?}"; : "${GITLAB_TOKEN:?}"
: "${REF:?}"; : "${PART:?}"; : "${IMAGE:?}"; : "${TCF:?}"
is_uint "$RETRIES" || { echo "--retries must be a non-negative integer: $RETRIES" >&2; exit 2; }
is_uint "$POLL_INTERVAL" || { echo "POLL_INTERVAL must be a non-negative integer" >&2; exit 2; }
is_uint "$POLL_MAX_ERRORS" || { echo "POLL_MAX_ERRORS must be a non-negative integer" >&2; exit 2; }
[ -r "$TCF" ] || { echo "cannot read test-cmds file: $TCF" >&2; exit 2; }

# GNU base64 needs -w0 to disable wrapping; other implementations reject -w,
# so fall back to plain base64 with the newlines stripped.
TC=$(base64 -w0 < "$TCF" 2>/dev/null || base64 < "$TCF" | tr -d '\n')
[ -n "$TC" ] || { echo "test-cmds file is empty: $TCF" >&2; exit 2; }

attempt=0
while :; do
  attempt=$((attempt + 1))

  # --form-string sends each value literally (-F would interpret a leading
  # '@' / '<' or a ';type=' suffix); --fail turns HTTP errors into a non-zero
  # exit instead of an error body being parsed as a pipeline object.
  PID=$(curl -fsS -X POST \
      --form-string "token=$TRIGGER_TOKEN" --form-string "ref=$REF" \
      --form-string "variables[PARTITION]=$PART" \
      --form-string "variables[IMAGE]=$IMAGE" \
      --form-string "variables[TEST_CMDS]=$TC" \
      "$API/trigger/pipeline" | json_field id) || PID=""
  if ! is_uint "$PID"; then
    # Without a pipeline id there is nothing to poll.
    echo "failed to trigger pipeline (attempt=$attempt)" >&2
    retry_or_exit; continue
  fi
  echo "pipeline=$PID attempt=$attempt"

  # Poll until the pipeline reaches a terminal status. Transient API errors
  # are tolerated, but a persistent one must not spin forever.
  poll_errors=0
  ST=""
  while :; do
    if ST=$(curl -fsS -H "PRIVATE-TOKEN: $GITLAB_TOKEN" "$API/pipelines/$PID" \
            | json_field status); then
      poll_errors=0
      case "$ST" in success|failed|canceled|skipped) break;; esac
    else
      poll_errors=$((poll_errors + 1))
      if [ "$poll_errors" -ge "$POLL_MAX_ERRORS" ]; then
        # No retry here: the pipeline may still be running, and triggering
        # another one would duplicate it.
        echo "cannot read status of pipeline $PID after $poll_errors polls" >&2
        exit 90
      fi
    fi
    sleep "$POLL_INTERVAL"
  done

  # Id of the first job named lyris-exec, or empty when there is none.
  JID=$(curl -fsS -H "PRIVATE-TOKEN: $GITLAB_TOKEN" "$API/pipelines/$PID/jobs" \
      | python3 -c 'import sys, json
ids = [j["id"] for j in json.load(sys.stdin) if j["name"] == "lyris-exec"]
if ids:
    print(ids[0])') || JID=""

  rm -rf lyris-artifacts && mkdir -p lyris-artifacts
  if [ -n "$JID" ]; then
    # Best effort: a missing summary is handled by the checks below.
    if ! curl -fsS -H "PRIVATE-TOKEN: $GITLAB_TOKEN" \
        "$API/jobs/$JID/artifacts/results/summary.txt" -o lyris-artifacts/summary.txt; then
      echo "warning: could not fetch summary.txt for job $JID" >&2
      rm -f lyris-artifacts/summary.txt
    fi
  else
    echo "warning: no lyris-exec job found in pipeline $PID" >&2
  fi

  if grep -q '^INFRA_FAILURE' lyris-artifacts/summary.txt 2>/dev/null || [ "$ST" = "canceled" ]; then
    retry_or_exit; continue
  fi

  # A missing summary or missing TEST_FAILS line counts as a failure.
  fails=$(awk -F= '/^TEST_FAILS=/{print $2}' lyris-artifacts/summary.txt 2>/dev/null)
  if [ "${fails:-1}" = "0" ]; then exit 0; fi
  exit 1
done
Loading