3232jobs :
3333 build-ubuntu :
3434 runs-on : ubuntu-24.04
35- strategy :
36- matrix :
37- include :
38- - gfx_target : gfx1151
39- s3_target : gfx1151
40- gpu_targets : gfx1151
41- - gfx_target : gfx1150
42- s3_target : gfx1150
43- gpu_targets : gfx1150
44- - gfx_target : gfx1153
45- s3_target : gfx1153
46- gpu_targets : gfx1153
47- # Hawk Point / Phoenix family (Radeon 760M/780M = gfx1103) ships only
48- # in the gfx110X-all bundle, which also covers desktop RDNA3
49- # (RX 7900/7800/7600). Build+release only — no on-hardware test runner.
50- - gfx_target : gfx110X
51- s3_target : gfx110X-all
52- gpu_targets : gfx1100;gfx1101;gfx1102;gfx1103
53- fail-fast : false
35+ # Single multiarch build: one fat binary covering all current CI arches,
36+ # sourced from TheRock's multiarch tarball (arch-neutral host + per-arch
37+ # Tensile DBs). gfx1100-1103 (RDNA3 desktop + Hawk Point/Phoenix 760M/780M),
38+ # gfx1150/1151/1153 (RDNA3.5 Strix APUs).
39+ env :
40+ GPU_TARGETS : gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151;gfx1153
5441 outputs :
5542 rocm_version : ${{ steps.set-outputs.outputs.rocm_version }}
5643 llamacpp_commit_hash : ${{ steps.set-outputs.outputs.llamacpp_commit_hash }}
@@ -84,16 +71,18 @@ jobs:
8471 ninja --version
8572 echo "Build dependencies installation completed"
8673
87- - name : Download and extract ROCm directly to /opt/rocm
74+ - name : Download and extract multiarch ROCm directly to /opt/rocm
8875 run : |
8976 rocm_version="${{ env.ROCM_VERSION }}"
90- s3_target="${{ matrix.s3_target }}"
77+ base_url="https://rocm.nightlies.amd.com/tarball-multi-arch"
9178
9279 if [ "$rocm_version" = "latest" ]; then
93- echo "Auto-detecting latest ROCm version for target: $s3_target"
94- s3_response=$(curl -s "https://therock-nightly-tarball.s3.amazonaws.com/?prefix=therock-dist-linux-${s3_target}-7")
95-
96- files=$(echo "$s3_response" | grep -oP '(?<=<Key>)[^<]*' | grep "therock-dist-linux-${s3_target}-")
80+ echo "Auto-detecting latest multiarch ROCm version"
81+ # The multiarch host serves an HTML index (not S3 XML); scrape the
82+ # multiarch tarball names from it.
83+ files=$(curl -s "$base_url/" \
84+ | grep -oE 'therock-dist-linux-multiarch-[0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+\.tar\.gz' \
85+ | sort -u)
9786
9887 latest_file=""
9988 latest_major=0
10392 latest_is_alpha=false
10493
10594 while IFS= read -r file; do
106- if [[ "$file" =~ therock-dist-linux-${s3_target}-.*?([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
95+ if [[ "$file" =~ therock-dist-linux-multiarch-([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
10796 version="${BASH_REMATCH[1]}"
10897 major=$(echo "$version" | cut -d. -f1)
10998 minor=$(echo "$version" | cut -d. -f2)
@@ -142,25 +131,47 @@ jobs:
142131
143132 echo "Found latest file: $latest_file"
144133
145- if [[ "$latest_file" =~ therock-dist-linux-${s3_target}-.*?([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
134+ if [[ "$latest_file" =~ therock-dist-linux-multiarch-([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
146135 rocm_version="${BASH_REMATCH[1]}"
147136 echo "Detected latest ROCm version: $rocm_version"
148137 else
149138 echo "Failed to extract ROCm version from latest file: $latest_file"
150- echo "Expected pattern: therock-dist-linux-${s3_target}-*<version>.tar.gz"
139+ echo "Expected pattern: therock-dist-linux-multiarch-<version>.tar.gz"
151140 exit 1
152141 fi
153-
154- rocm_url="https://therock-nightly-tarball.s3.amazonaws.com/$latest_file"
155- else
156- rocm_url="https://therock-nightly-tarball.s3.amazonaws.com/therock-dist-linux-${s3_target}-${rocm_version}.tar.gz"
157142 fi
158143
144+ rocm_url="$base_url/therock-dist-linux-multiarch-${rocm_version}.tar.gz"
159145 echo "DETECTED_ROCM_VERSION=$rocm_version" >> $GITHUB_ENV
160146
161- echo "Streaming ROCm from: $rocm_url directly to extraction"
147+ # The multiarch tarball (~11.5 GB) ships device code for ALL 26 GPU
148+ # arches: every arch's .kpack plus per-arch rocBLAS/hipBLASLt Tensile DBs.
149+ # This consumer build only needs the 7 RDNA3/3.5 arches in GPU_TARGETS and
150+ # uses the GEMM (Tensile) path, which works without .kpack files. So we
151+ # stream-extract and prune at the tar level: drop ALL .kpack, and drop the
152+ # Tensile DBs of every arch not in our target set. This keeps the runner
153+ # disk footprint small (the 11.5 GB is streamed, never stored) and yields
154+ # a lean multiarch package. tar matches --exclude on pre-strip member
155+ # names, hence the leading "./".
156+ drop_arches="gfx900 gfx906 gfx908 gfx90a gfx942 gfx950 \
157+ gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 \
158+ gfx1152 gfx1200 gfx1201"
159+ excludes=(--exclude='./.kpack' --exclude='./.kpack/*')
160+ for a in $drop_arches; do
161+ excludes+=("--exclude=./lib/*/library/${a}")
162+ excludes+=("--exclude=./lib/*/library/${a}/*")
163+ excludes+=("--exclude=./lib/*/library/*${a}*")
164+ done
165+
166+ echo "Streaming multiarch ROCm from: $rocm_url (pruning .kpack + non-target arches)"
162167 sudo mkdir -p /opt/rocm
163- curl -sL "$rocm_url" | sudo tar --use-compress-program=gzip -xf - -C /opt/rocm --strip-components=1
168+ curl -sL "$rocm_url" | sudo tar --use-compress-program=gzip -xf - \
169+ -C /opt/rocm --strip-components=1 "${excludes[@]}"
170+
171+ echo "Retained rocBLAS Tensile arch dirs:"
172+ ls /opt/rocm/lib/rocblas/library/ 2>/dev/null || echo "(none)"
173+ echo "Retained hipBLASLt Tensile arch dirs:"
174+ ls /opt/rocm/lib/hipblaslt/library/ 2>/dev/null || echo "(none)"
164175
165176 - name : Set ROCm environment variables
166177 run : |
@@ -189,9 +200,8 @@ jobs:
189200
190201 - name : Build Llama.cpp + ROCm
191202 run : |
192- current_target="${{ matrix.gfx_target }}"
193- gpu_targets="${{ matrix.gpu_targets }}"
194- echo "Building for target: $current_target (GPU_TARGETS=$gpu_targets)"
203+ gpu_targets="${{ env.GPU_TARGETS }}"
204+ echo "Building multiarch binary (GPU_TARGETS=$gpu_targets)"
195205
196206 mkdir build
197207 cd build
@@ -300,7 +310,7 @@ jobs:
300310 - name : Upload build artifacts
301311 uses : actions/upload-artifact@v4
302312 with :
303- name : llama-ubuntu-rocm-${{ matrix.gfx_target }}-x64
313+ name : llama-ubuntu-rocm-multiarch-x64
304314 path : build/bin/
305315 retention-days : 30
306316
@@ -316,19 +326,10 @@ jobs:
316326 test-gfx :
317327 needs : build-ubuntu
318328 if : needs.build-ubuntu.result == 'success'
319- runs-on : ${{ matrix.runner }}
320- strategy :
321- matrix :
322- include :
323- - gfx_target : gfx1151
324- runner : linux-gfx1151-gpu-rocm
325- # gfx1150 test temporarily disabled: the linux-gfx1150-gpu-rocm
326- # runner (Bangalore box) hangs in llama-cli GPU inference while the
327- # identical artifact/command passes on gfx1151 in seconds. Re-enable
328- # once the runner's GPU/driver issue is resolved.
329- # - gfx_target: gfx1150
330- # runner: linux-gfx1150-gpu-rocm
331- fail-fast : false
329+ # Single hardware test of the multiarch artifact on gfx1151. This is the
330+ # end-to-end safety net for the Tensile-only multiarch package: a real
331+ # llama-cli inference exercising the rocBLAS/hipBLASLt GEMM path on-device.
332+ runs-on : linux-gfx1151-gpu-rocm
332333
333334 steps :
334335 - name : Checkout repository
@@ -337,7 +338,7 @@ jobs:
337338 - name : Download build artifacts
338339 uses : actions/download-artifact@v4
339340 with :
340- name : llama-ubuntu-rocm-${{ matrix.gfx_target }}-x64
341+ name : llama-ubuntu-rocm-multiarch-x64
341342 path : llama-binaries
342343
343344 - name : Download test model
@@ -380,7 +381,7 @@ jobs:
380381 # Use a prompt with a single correct answer and greedy decoding
381382 # (--temp 0) so the result is deterministic and verifiable.
382383 prompt="What is 2 + 2? Reply with only the number."
383- echo "Running llama-cli test for ${{ matrix.gfx_target }} ..."
384+ echo "Running llama-cli test for gfx1151 (multiarch artifact) ..."
384385 echo "Command: $llama_cli_path -m \"$model_path\" -ngl 99 --temp 0 -p \"$prompt\" -st -v"
385386
386387 # Bound the run: a healthy 0.6B inference finishes in seconds. If the
@@ -469,12 +470,11 @@ jobs:
469470 contents : write
470471 # Publish only on the nightly dispatch (external cron passes
471472 # -f create_release=true). Push/PR and manual runs never release.
472- # Require the build to succeed and tests to pass-or-skip (gfx1150 test is
473- # currently skipped; its build artifact is still published).
473+ # Require the build to succeed and the gfx1151 hardware test to pass.
474474 if : |
475475 always() &&
476476 needs.build-ubuntu.result == 'success' &&
477- ( needs.test-gfx.result == 'success' || needs.test-gfx.result == 'skipped') &&
477+ needs.test-gfx.result == 'success' &&
478478 github.event_name == 'workflow_dispatch' &&
479479 github.event.inputs.create_release == 'true'
480480 steps :
@@ -505,21 +505,20 @@ jobs:
505505 fi
506506 echo "Release tag: $TAG"
507507
508- - name : Create per-target archives
508+ - name : Create multiarch archive
509509 if : steps.generate-tag.outputs.tag_exists == 'false'
510510 run : |
511511 TAG="${{ steps.generate-tag.outputs.tag }}"
512512 root="$PWD"
513- for target in gfx1151 gfx1150 gfx1153 gfx110X; do
514- artifact_dir="./all-artifacts/llama-ubuntu-rocm-${target}-x64"
515- archive="llama-${TAG}-ubuntu-rocm-${target}-x64"
516- if [ -d "$artifact_dir" ]; then
517- echo "Creating ${archive}.tar.gz"
518- tar -czf "$root/${archive}.tar.gz" -C "$artifact_dir" .
519- else
520- echo "Warning: artifact dir not found: $artifact_dir"
521- fi
522- done
513+ artifact_dir="./all-artifacts/llama-ubuntu-rocm-multiarch-x64"
514+ archive="llama-${TAG}-ubuntu-rocm-multiarch-x64"
515+ if [ -d "$artifact_dir" ]; then
516+ echo "Creating ${archive}.tar.gz"
517+ tar -czf "$root/${archive}.tar.gz" -C "$artifact_dir" .
518+ else
519+ echo "ERROR: artifact dir not found: $artifact_dir"
520+ exit 1
521+ fi
523522 ls -la *.tar.gz
524523
525524 - name : Create GitHub Release
@@ -535,10 +534,10 @@ jobs:
535534 --title "$TAG" \
536535 --notes "**Build**: $TAG
537536 **OS**: ubuntu
538- **GPU Target(s)**: gfx1151, gfx1150, gfx1153, gfx110X (gfx1100/1101/1102/1103)
539- **ROCm Version**: $ROCM_VERSION
537+ **GPU Target(s)**: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1153 (single multiarch binary)
538+ **ROCm Version**: $ROCM_VERSION (multiarch)
540539 **Llama.cpp Commit**: $LLAMACPP_COMMIT_HASH
541540 **Build Date**: $(date -u '+%Y-%m-%d %H:%M:%S UTC')
542541
543- Prebuilt llama.cpp ROCm binaries for the RDNA3.5 gfx115x APUs (gfx1151/gfx1150/gfx1153) and the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M), with ROCm runtime libraries bundled." \
542+ Prebuilt llama.cpp ROCm binaries — one multiarch package covering the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M) and the RDNA3.5 gfx115x APUs (gfx1150/gfx1151/gfx1153). Built from TheRock's multiarch ROCm runtime, pruned to the CI target arches (per-arch Tensile databases bundled)." \
544543 *.tar.gz
0 commit comments