Skip to content

Commit a18e333

Browse files
authored
Merge branch 'gfx11' into annier.mmq-device-table
2 parents e109b83 + b5e799d commit a18e333

1 file changed

Lines changed: 69 additions & 70 deletions

File tree

.github/workflows/build-gfx11-rocm.yml

Lines changed: 69 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -32,25 +32,12 @@ env:
3232
jobs:
3333
build-ubuntu:
3434
runs-on: ubuntu-24.04
35-
strategy:
36-
matrix:
37-
include:
38-
- gfx_target: gfx1151
39-
s3_target: gfx1151
40-
gpu_targets: gfx1151
41-
- gfx_target: gfx1150
42-
s3_target: gfx1150
43-
gpu_targets: gfx1150
44-
- gfx_target: gfx1153
45-
s3_target: gfx1153
46-
gpu_targets: gfx1153
47-
# Hawk Point / Phoenix family (Radeon 760M/780M = gfx1103) ships only
48-
# in the gfx110X-all bundle, which also covers desktop RDNA3
49-
# (RX 7900/7800/7600). Build+release only — no on-hardware test runner.
50-
- gfx_target: gfx110X
51-
s3_target: gfx110X-all
52-
gpu_targets: gfx1100;gfx1101;gfx1102;gfx1103
53-
fail-fast: false
35+
# Single multiarch build: one fat binary covering all current CI arches,
36+
# sourced from TheRock's multiarch tarball (arch-neutral host + per-arch
37+
# Tensile DBs). Covers gfx1100-1103 (RDNA3 desktop + Hawk Point/Phoenix 760M/780M)
38+
# and gfx1150/1151/1153 (RDNA3.5 Strix APUs).
39+
env:
40+
GPU_TARGETS: gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151;gfx1153
5441
outputs:
5542
rocm_version: ${{ steps.set-outputs.outputs.rocm_version }}
5643
llamacpp_commit_hash: ${{ steps.set-outputs.outputs.llamacpp_commit_hash }}
@@ -84,16 +71,18 @@ jobs:
8471
ninja --version
8572
echo "Build dependencies installation completed"
8673
87-
- name: Download and extract ROCm directly to /opt/rocm
74+
- name: Download and extract multiarch ROCm directly to /opt/rocm
8875
run: |
8976
rocm_version="${{ env.ROCM_VERSION }}"
90-
s3_target="${{ matrix.s3_target }}"
77+
base_url="https://rocm.nightlies.amd.com/tarball-multi-arch"
9178
9279
if [ "$rocm_version" = "latest" ]; then
93-
echo "Auto-detecting latest ROCm version for target: $s3_target"
94-
s3_response=$(curl -s "https://therock-nightly-tarball.s3.amazonaws.com/?prefix=therock-dist-linux-${s3_target}-7")
95-
96-
files=$(echo "$s3_response" | grep -oP '(?<=<Key>)[^<]*' | grep "therock-dist-linux-${s3_target}-")
80+
echo "Auto-detecting latest multiarch ROCm version"
81+
# The multiarch host serves an HTML index (not S3 XML); scrape the
82+
# multiarch tarball names from it.
83+
files=$(curl -s "$base_url/" \
84+
| grep -oE 'therock-dist-linux-multiarch-[0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+\.tar\.gz' \
85+
| sort -u)
9786
9887
latest_file=""
9988
latest_major=0
@@ -103,7 +92,7 @@ jobs:
10392
latest_is_alpha=false
10493
10594
while IFS= read -r file; do
106-
if [[ "$file" =~ therock-dist-linux-${s3_target}-.*?([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
95+
if [[ "$file" =~ therock-dist-linux-multiarch-([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
10796
version="${BASH_REMATCH[1]}"
10897
major=$(echo "$version" | cut -d. -f1)
10998
minor=$(echo "$version" | cut -d. -f2)
@@ -142,25 +131,47 @@ jobs:
142131
143132
echo "Found latest file: $latest_file"
144133
145-
if [[ "$latest_file" =~ therock-dist-linux-${s3_target}-.*?([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
134+
if [[ "$latest_file" =~ therock-dist-linux-multiarch-([0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+)\.tar\.gz ]]; then
146135
rocm_version="${BASH_REMATCH[1]}"
147136
echo "Detected latest ROCm version: $rocm_version"
148137
else
149138
echo "Failed to extract ROCm version from latest file: $latest_file"
150-
echo "Expected pattern: therock-dist-linux-${s3_target}-*<version>.tar.gz"
139+
echo "Expected pattern: therock-dist-linux-multiarch-<version>.tar.gz"
151140
exit 1
152141
fi
153-
154-
rocm_url="https://therock-nightly-tarball.s3.amazonaws.com/$latest_file"
155-
else
156-
rocm_url="https://therock-nightly-tarball.s3.amazonaws.com/therock-dist-linux-${s3_target}-${rocm_version}.tar.gz"
157142
fi
158143
144+
rocm_url="$base_url/therock-dist-linux-multiarch-${rocm_version}.tar.gz"
159145
echo "DETECTED_ROCM_VERSION=$rocm_version" >> $GITHUB_ENV
160146
161-
echo "Streaming ROCm from: $rocm_url directly to extraction"
147+
# The multiarch tarball (~11.5 GB) ships device code for ALL 26 GPU
148+
# arches: every arch's .kpack plus per-arch rocBLAS/hipBLASLt Tensile DBs.
149+
# This consumer build only needs the 7 RDNA3/3.5 arches in GPU_TARGETS and
150+
# uses the GEMM (Tensile) path, which works without .kpack files. So we
151+
# stream-extract and prune at the tar level: drop ALL .kpack, and drop the
152+
# Tensile DBs of every arch not in our target set. This keeps the runner
153+
# disk footprint small (the 11.5 GB is streamed, never stored) and yields
154+
# a lean multiarch package. tar applies --exclude to member names as stored
155+
# in the archive (before --strip-components), hence the leading "./".
156+
drop_arches="gfx900 gfx906 gfx908 gfx90a gfx942 gfx950 \
157+
gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 \
158+
gfx1152 gfx1200 gfx1201"
159+
excludes=(--exclude='./.kpack' --exclude='./.kpack/*')
160+
for a in $drop_arches; do
161+
excludes+=("--exclude=./lib/*/library/${a}")
162+
excludes+=("--exclude=./lib/*/library/${a}/*")
163+
excludes+=("--exclude=./lib/*/library/*${a}*")
164+
done
165+
166+
echo "Streaming multiarch ROCm from: $rocm_url (pruning .kpack + non-target arches)"
162167
sudo mkdir -p /opt/rocm
163-
curl -sL "$rocm_url" | sudo tar --use-compress-program=gzip -xf - -C /opt/rocm --strip-components=1
168+
curl -sL "$rocm_url" | sudo tar --use-compress-program=gzip -xf - \
169+
-C /opt/rocm --strip-components=1 "${excludes[@]}"
170+
171+
echo "Retained rocBLAS Tensile arch dirs:"
172+
ls /opt/rocm/lib/rocblas/library/ 2>/dev/null || echo "(none)"
173+
echo "Retained hipBLASLt Tensile arch dirs:"
174+
ls /opt/rocm/lib/hipblaslt/library/ 2>/dev/null || echo "(none)"
164175
165176
- name: Set ROCm environment variables
166177
run: |
@@ -189,9 +200,8 @@ jobs:
189200
190201
- name: Build Llama.cpp + ROCm
191202
run: |
192-
current_target="${{ matrix.gfx_target }}"
193-
gpu_targets="${{ matrix.gpu_targets }}"
194-
echo "Building for target: $current_target (GPU_TARGETS=$gpu_targets)"
203+
gpu_targets="${{ env.GPU_TARGETS }}"
204+
echo "Building multiarch binary (GPU_TARGETS=$gpu_targets)"
195205
196206
mkdir build
197207
cd build
@@ -300,7 +310,7 @@ jobs:
300310
- name: Upload build artifacts
301311
uses: actions/upload-artifact@v4
302312
with:
303-
name: llama-ubuntu-rocm-${{ matrix.gfx_target }}-x64
313+
name: llama-ubuntu-rocm-multiarch-x64
304314
path: build/bin/
305315
retention-days: 30
306316

@@ -316,19 +326,10 @@ jobs:
316326
test-gfx:
317327
needs: build-ubuntu
318328
if: needs.build-ubuntu.result == 'success'
319-
runs-on: ${{ matrix.runner }}
320-
strategy:
321-
matrix:
322-
include:
323-
- gfx_target: gfx1151
324-
runner: linux-gfx1151-gpu-rocm
325-
# gfx1150 test temporarily disabled: the linux-gfx1150-gpu-rocm
326-
# runner (Bangalore box) hangs in llama-cli GPU inference while the
327-
# identical artifact/command passes on gfx1151 in seconds. Re-enable
328-
# once the runner's GPU/driver issue is resolved.
329-
# - gfx_target: gfx1150
330-
# runner: linux-gfx1150-gpu-rocm
331-
fail-fast: false
329+
# Single hardware test of the multiarch artifact on gfx1151. This is the
330+
# end-to-end safety net for the Tensile-only multiarch package: a real
331+
# llama-cli inference exercising the rocBLAS/hipBLASLt GEMM path on-device.
332+
runs-on: linux-gfx1151-gpu-rocm
332333

333334
steps:
334335
- name: Checkout repository
@@ -337,7 +338,7 @@ jobs:
337338
- name: Download build artifacts
338339
uses: actions/download-artifact@v4
339340
with:
340-
name: llama-ubuntu-rocm-${{ matrix.gfx_target }}-x64
341+
name: llama-ubuntu-rocm-multiarch-x64
341342
path: llama-binaries
342343

343344
- name: Download test model
@@ -380,7 +381,7 @@ jobs:
380381
# Use a prompt with a single correct answer and greedy decoding
381382
# (--temp 0) so the result is deterministic and verifiable.
382383
prompt="What is 2 + 2? Reply with only the number."
383-
echo "Running llama-cli test for ${{ matrix.gfx_target }}..."
384+
echo "Running llama-cli test for gfx1151 (multiarch artifact)..."
384385
echo "Command: $llama_cli_path -m \"$model_path\" -ngl 99 --temp 0 -p \"$prompt\" -st -v"
385386
386387
# Bound the run: a healthy 0.6B inference finishes in seconds. If the
@@ -469,12 +470,11 @@ jobs:
469470
contents: write
470471
# Publish only on the nightly dispatch (external cron passes
471472
# -f create_release=true). Push/PR and manual runs never release.
472-
# Require the build to succeed and tests to pass-or-skip (gfx1150 test is
473-
# currently skipped; its build artifact is still published).
473+
# Require the build to succeed and the gfx1151 hardware test to pass.
474474
if: |
475475
always() &&
476476
needs.build-ubuntu.result == 'success' &&
477-
(needs.test-gfx.result == 'success' || needs.test-gfx.result == 'skipped') &&
477+
needs.test-gfx.result == 'success' &&
478478
github.event_name == 'workflow_dispatch' &&
479479
github.event.inputs.create_release == 'true'
480480
steps:
@@ -505,21 +505,20 @@ jobs:
505505
fi
506506
echo "Release tag: $TAG"
507507
508-
- name: Create per-target archives
508+
- name: Create multiarch archive
509509
if: steps.generate-tag.outputs.tag_exists == 'false'
510510
run: |
511511
TAG="${{ steps.generate-tag.outputs.tag }}"
512512
root="$PWD"
513-
for target in gfx1151 gfx1150 gfx1153 gfx110X; do
514-
artifact_dir="./all-artifacts/llama-ubuntu-rocm-${target}-x64"
515-
archive="llama-${TAG}-ubuntu-rocm-${target}-x64"
516-
if [ -d "$artifact_dir" ]; then
517-
echo "Creating ${archive}.tar.gz"
518-
tar -czf "$root/${archive}.tar.gz" -C "$artifact_dir" .
519-
else
520-
echo "Warning: artifact dir not found: $artifact_dir"
521-
fi
522-
done
513+
artifact_dir="./all-artifacts/llama-ubuntu-rocm-multiarch-x64"
514+
archive="llama-${TAG}-ubuntu-rocm-multiarch-x64"
515+
if [ -d "$artifact_dir" ]; then
516+
echo "Creating ${archive}.tar.gz"
517+
tar -czf "$root/${archive}.tar.gz" -C "$artifact_dir" .
518+
else
519+
echo "ERROR: artifact dir not found: $artifact_dir"
520+
exit 1
521+
fi
523522
ls -la *.tar.gz
524523
525524
- name: Create GitHub Release
@@ -535,10 +534,10 @@ jobs:
535534
--title "$TAG" \
536535
--notes "**Build**: $TAG
537536
**OS**: ubuntu
538-
**GPU Target(s)**: gfx1151, gfx1150, gfx1153, gfx110X (gfx1100/1101/1102/1103)
539-
**ROCm Version**: $ROCM_VERSION
537+
**GPU Target(s)**: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1153 (single multiarch binary)
538+
**ROCm Version**: $ROCM_VERSION (multiarch)
540539
**Llama.cpp Commit**: $LLAMACPP_COMMIT_HASH
541540
**Build Date**: $(date -u '+%Y-%m-%d %H:%M:%S UTC')
542541
543-
Prebuilt llama.cpp ROCm binaries for the RDNA3.5 gfx115x APUs (gfx1151/gfx1150/gfx1153) and the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M), with ROCm runtime libraries bundled." \
542+
Prebuilt llama.cpp ROCm binaries — one multiarch package covering the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M) and the RDNA3.5 gfx115x APUs (gfx1150/gfx1151/gfx1153). Built from TheRock's multiarch ROCm runtime, pruned to the CI target arches (per-arch Tensile databases bundled)." \
544543
*.tar.gz

0 commit comments

Comments
 (0)