# Workflow captured from pull request #44:
# "gfx11: sync ROCm CI + RDNA3.5 MMQ device table onto upstream-synced master"

name: Build gfx11 + ROCm

# Triggers: pushes to the gfx11 branch, pull-request updates, and manual /
# externally scheduled dispatches that may override the ROCm version and opt
# in to publishing a release.
on:
  push:
    branches:
      - gfx11
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
  workflow_dispatch:
    inputs:
      rocm_version:
        description: 'ROCm version to use (e.g., 7.14.0a20260608) or "latest" to auto-detect'
        required: false
        default: '7.14.0a20260608'
      create_release:
        description: 'Publish a dated GitHub Release (b<YYYYMMDD>) with the built binaries. Set true only for the nightly dispatch.'
        required: false
        default: 'false'

# ROCm pin: 7.14.0a20260608 is the last known-good nightly (libhsa-runtime64
# build d34cbb6409). Nightlies from 7.14.0a20260609 onward ship a regressed
# libhsa-runtime64 (build 1b2a555677) that segfaults in GpuAgent::InitDma on
# the gfx115x runners; see https://github.com/ROCm/TheRock/issues/5763.
# Dispatch manually with rocm_version=latest to track the newest nightly.
#
# The nightly is driven externally because GitHub cron only fires from the
# default branch. On a host you control, with `gh` authenticated (token scope:
# repo + workflow), install a crontab entry such as the one below. The
# create-release job only publishes when create_release=true is passed, so the
# nightly entry must include it:
#   0 13 * * * gh workflow run build-gfx11-rocm.yml --repo ROCm/llama.cpp --ref gfx11 -f rocm_version=7.14.0a20260608 -f create_release=true
env:
  # Dispatch input when present, otherwise the pinned nightly (push / PR runs).
  ROCM_VERSION: ${{ github.event.inputs.rocm_version || '7.14.0a20260608' }}
jobs:
  build-ubuntu:
    runs-on: ubuntu-24.04
    # One multiarch ("fat") build covers every current CI architecture. ROCm
    # comes from TheRock's multiarch tarball (arch-neutral host + per-arch
    # Tensile DBs). Targets:
    #   gfx1100-1103      RDNA3 desktop + Hawk Point/Phoenix 760M/780M
    #   gfx1150/1151/1153 RDNA3.5 Strix APUs
    outputs:
      rocm_version: ${{ steps.set-outputs.outputs.rocm_version }}
      llamacpp_commit_hash: ${{ steps.set-outputs.outputs.llamacpp_commit_hash }}
    env:
      GPU_TARGETS: gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151;gfx1153
    steps:
# Reclaim runner disk before the build by running a cleanup script that is
# fetched at a pinned commit and piped straight into bash.
- name: Free disk space
  run: curl -fsSL https://raw.githubusercontent.com/kou/arrow/e49d8ae15583ceff03237571569099a6ad62be32/ci/scripts/util_free_space.sh | bash
# Fetch the llama.cpp sources that the build steps below compile.
- name: Checkout repository
  uses: actions/checkout@v4
# Start from a clean slate: drop any ROCm tree or leftover tarball so the
# extraction step cannot mix old files with the new download.
- name: Clean up existing ROCm directory (safety precaution)
  run: |
    rocm_dir="/opt/rocm"
    stale_tarball="rocm.tar.gz"

    if [[ -d "$rocm_dir" ]]; then
      echo "Removing existing /opt/rocm directory..."
      sudo rm -rf "$rocm_dir"
    fi

    if [[ -f "$stale_tarball" ]]; then
      rm -f "$stale_tarball"
    fi

    echo "Cleanup completed successfully"
# Install the host tools the build needs and print their versions so a
# toolchain change is visible in the job log.
- name: Install build dependencies
  run: |
    echo "Installing build dependencies..."
    # apt-get is the script-stable front end (plain `apt` warns that its CLI
    # is not stable for scripting) and matches the patchelf install later in
    # this job.
    sudo apt-get update
    sudo apt-get install -y cmake ninja-build unzip curl
    echo "Verifying installations..."
    cmake --version
    ninja --version
    echo "Build dependencies installation completed"
# Resolve the ROCm version (pinned, or newest nightly when "latest"), then
# stream the multiarch tarball into /opt/rocm while pruning everything the
# target arches do not need. Exports DETECTED_ROCM_VERSION for later steps.
- name: Download and extract multiarch ROCm directly to /opt/rocm
  run: |
    # Read the requested version from the workflow-level environment instead
    # of splicing the expression into the script text, so a dispatch input can
    # never be interpreted as shell syntax.
    rocm_version="$ROCM_VERSION"
    base_url="https://rocm.nightlies.amd.com/tarball-multi-arch"
    # Capture groups: 1=full version, 2=major, 3=minor, 4=patch,
    # 5=pre-release kind (a|rc), 6=pre-release number.
    tarball_re='therock-dist-linux-multiarch-(([0-9]+)\.([0-9]+)\.([0-9]+)(a|rc)([0-9]+))\.tar\.gz'

    if [ "$rocm_version" = "latest" ]; then
      echo "Auto-detecting latest multiarch ROCm version"
      # The multiarch host serves an HTML index (not S3 XML); scrape the
      # multiarch tarball names from it. -f/-S surface HTTP errors in the log;
      # an empty listing is still reported by the check after the loop.
      files=$(curl -fsSL "$base_url/" \
        | grep -oE 'therock-dist-linux-multiarch-[0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+\.tar\.gz' \
        | sort -u)

      latest_file=""
      latest_major=0
      latest_minor=0
      latest_patch=0
      latest_rc=0
      latest_is_alpha=false

      while IFS= read -r file; do
        [[ "$file" =~ $tarball_re ]] || continue
        major="${BASH_REMATCH[2]}"
        minor="${BASH_REMATCH[3]}"
        patch="${BASH_REMATCH[4]}"
        rc="${BASH_REMATCH[6]}"
        is_alpha=false
        if [ "${BASH_REMATCH[5]}" = "a" ]; then
          is_alpha=true
        fi

        # Order by major, minor, patch; on a tie an alpha build beats a
        # non-alpha one, and within the same kind the higher number wins.
        is_newer=false
        if [ "$major" -gt "$latest_major" ]; then
          is_newer=true
        elif [ "$major" -eq "$latest_major" ] && [ "$minor" -gt "$latest_minor" ]; then
          is_newer=true
        elif [ "$major" -eq "$latest_major" ] && [ "$minor" -eq "$latest_minor" ] && [ "$patch" -gt "$latest_patch" ]; then
          is_newer=true
        elif [ "$major" -eq "$latest_major" ] && [ "$minor" -eq "$latest_minor" ] && [ "$patch" -eq "$latest_patch" ]; then
          if [ "$is_alpha" = true ] && [ "$latest_is_alpha" = false ]; then
            is_newer=true
          elif [ "$is_alpha" = "$latest_is_alpha" ] && [ "$rc" -gt "$latest_rc" ]; then
            is_newer=true
          fi
        fi

        if [ "$is_newer" = true ]; then
          latest_file="$file"
          latest_major="$major"
          latest_minor="$minor"
          latest_patch="$patch"
          latest_rc="$rc"
          latest_is_alpha="$is_alpha"
        fi
      done <<< "$files"

      echo "Found latest file: $latest_file"
      if [[ "$latest_file" =~ $tarball_re ]]; then
        rocm_version="${BASH_REMATCH[1]}"
        echo "Detected latest ROCm version: $rocm_version"
      else
        echo "Failed to extract ROCm version from latest file: $latest_file"
        echo "Expected pattern: therock-dist-linux-multiarch-<version>.tar.gz"
        exit 1
      fi
    fi

    rocm_url="$base_url/therock-dist-linux-multiarch-${rocm_version}.tar.gz"
    echo "DETECTED_ROCM_VERSION=$rocm_version" >> "$GITHUB_ENV"

    # The multiarch tarball (~11.5 GB) ships device code for ALL 26 GPU
    # arches: every arch's .kpack plus per-arch rocBLAS/hipBLASLt Tensile DBs.
    # This consumer build only needs the 7 RDNA3/3.5 arches in GPU_TARGETS and
    # uses the GEMM (Tensile) path, which works without .kpack files. So we
    # stream-extract and prune at the tar level: drop ALL .kpack, and drop the
    # Tensile DBs of every arch not in our target set. This keeps the runner
    # disk footprint small (the 11.5 GB is streamed, never stored) and yields
    # a lean multiarch package. tar matches --exclude on pre-strip member
    # names, hence the leading "./".
    drop_arches=(
      gfx900 gfx906 gfx908 gfx90a gfx942 gfx950
      gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036
      gfx1152 gfx1200 gfx1201
    )
    excludes=(--exclude='./.kpack' --exclude='./.kpack/*')
    for a in "${drop_arches[@]}"; do
      excludes+=("--exclude=./lib/*/library/${a}")
      excludes+=("--exclude=./lib/*/library/${a}/*")
      excludes+=("--exclude=./lib/*/library/*${a}*")
    done

    echo "Streaming multiarch ROCm from: $rocm_url (pruning .kpack + non-target arches)"
    sudo mkdir -p /opt/rocm
    # -f makes curl fail on an HTTP error (e.g. a 404 for an unknown version)
    # instead of piping the error page into tar; pipefail makes that failure
    # fail the step with curl's own message. No --retry here: a retried
    # transfer would re-send bytes into the already-consumed pipe.
    set -o pipefail
    curl -fsSL "$rocm_url" | sudo tar --use-compress-program=gzip -xf - \
      -C /opt/rocm --strip-components=1 "${excludes[@]}"
    set +o pipefail

    echo "Retained rocBLAS Tensile arch dirs:"
    ls /opt/rocm/lib/rocblas/library/ 2>/dev/null || echo "(none)"
    echo "Retained hipBLASLt Tensile arch dirs:"
    ls /opt/rocm/lib/hipblaslt/library/ 2>/dev/null || echo "(none)"
# Export the ROCm/HIP locations to every later step of this job through
# GITHUB_ENV, and put the ROCm tool directories on PATH through GITHUB_PATH.
- name: Set ROCm environment variables
  run: |
    echo "Setting ROCm environment variables..."
    {
      echo "HIP_PATH=/opt/rocm"
      echo "ROCM_PATH=/opt/rocm"
      echo "HIP_PLATFORM=amd"
      echo "HIP_CLANG_PATH=/opt/rocm/llvm/bin"
      echo "HIP_INCLUDE_PATH=/opt/rocm/include"
      echo "HIP_LIB_PATH=/opt/rocm/lib"
      echo "HIP_DEVICE_LIB_PATH=/opt/rocm/lib/llvm/amdgcn/bitcode"
      # Search-path variables keep whatever the runner already had appended.
      echo "LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:/opt/rocm/llvm/lib:${LD_LIBRARY_PATH:-}"
      echo "LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:${LIBRARY_PATH:-}"
      echo "CPATH=/opt/rocm/include:${CPATH:-}"
      echo "PKG_CONFIG_PATH=/opt/rocm/lib/pkgconfig:${PKG_CONFIG_PATH:-}"
    } >> "$GITHUB_ENV"
    # GITHUB_PATH takes one directory per line and prepends it to PATH itself,
    # so do not write a colon-joined list or re-append $PATH (that duplicates
    # the whole PATH). Entries are prepended, so the last line written ends up
    # first: llvm/bin goes first to keep /opt/rocm/bin ahead of it.
    echo "/opt/rocm/llvm/bin" >> "$GITHUB_PATH"
    echo "/opt/rocm/bin" >> "$GITHUB_PATH"
    echo "ROCm environment variables set successfully"
# Export the abbreviated HEAD commit as LLAMACPP_COMMIT_HASH for later steps
# and log the commit being built.
- name: Record llama.cpp commit hash
  run: |
    short_sha="$(git rev-parse --short=5 HEAD)"
    echo "LLAMACPP_COMMIT_HASH=$short_sha" >> $GITHUB_ENV
    echo "llama.cpp commit hash (5 digits): $short_sha"
    echo "Current llama.cpp commit:"
    git log --oneline -1
# Configure with the ROCm clang toolchain for every arch in GPU_TARGETS and
# build with Ninja using all available cores.
- name: Build Llama.cpp + ROCm
  run: |
    gpu_targets="${{ env.GPU_TARGETS }}"
    echo "Building multiarch binary (GPU_TARGETS=$gpu_targets)"

    # Configure-time options, one per line, passed to cmake in this order.
    cmake_args=(
      -G Ninja
      -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang
      -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++
      -DCMAKE_CXX_FLAGS="-I/opt/rocm/include"
      -DCMAKE_CROSSCOMPILING=ON
      -DCMAKE_BUILD_TYPE=Release
      -DGPU_TARGETS="$gpu_targets"
      -DBUILD_SHARED_LIBS=ON
      -DLLAMA_BUILD_TESTS=OFF
      -DGGML_HIP=ON
      -DGGML_OPENMP=OFF
      -DGGML_CUDA_FORCE_CUBLAS=OFF
      -DGGML_RPC=ON
      -DGGML_HIP_ROCWMMA_FATTN=OFF
      -DLLAMA_BUILD_BORINGSSL=ON
      -DGGML_NATIVE=OFF
      -DGGML_STATIC=OFF
      -DCMAKE_SYSTEM_NAME=Linux
    )

    mkdir build
    cd build
    cmake .. "${cmake_args[@]}"
    cmake --build . -j $(nproc)
# Bundle the ROCm runtime pieces next to the built binaries so the artifact
# carries them along: Tensile library folders, the shared libraries listed
# below, and libatomic.
- name: Copy ROCm core libs to build directory
  run: |
    build_bin_path="build/bin"
    rocm_lib_root="/opt/rocm/lib"

    # Copy the rocblas/library and hipblaslt/library folders with all their
    # contents, keeping the <name>/library layout under build/bin.
    for blas in rocblas hipblaslt; do
      src_dir="$rocm_lib_root/$blas/library"
      if [ -d "$src_dir" ]; then
        echo "Copying $blas/library folder and all contents..."
        mkdir -p "$build_bin_path/$blas"
        cp -r "$src_dir" "$build_bin_path/$blas/"
        echo "Copied: $blas/library folder with all contents"
      else
        echo "Warning: $blas/library folder not found at: $src_dir"
      fi
    done

    # Copy required ROCm libraries to build directory
    # If artifacts from ROCm or Llama.cpp change, you may need to update this list.
    # To regenerate the list, run:
    #   utils/gather_required_libs.py --rocm-dir /opt/rocm --dest-dir build/bin
    # Entries are paths relative to /opt/rocm/lib without the ".so*" suffix;
    # a missing library is reported and skipped rather than failing the step.
    required_libs=(
      libhipblas
      librocblas
      libamdhip64
      librocsolver
      libroctx64
      libhipblaslt
      rocm_sysdeps/lib/librocm_sysdeps_liblzma
      librocprofiler-register
      libamd_comgr
      libamd_comgr_loader
      libhsa-runtime64
      rocm_sysdeps/lib/librocm_sysdeps_numa
      librocroller
      librocm_kpack
      rocm_sysdeps/lib/librocm_sysdeps_z
      rocm_sysdeps/lib/librocm_sysdeps_zstd
      llvm/lib/libLLVM
      llvm/lib/libclang-cpp
      rocm_sysdeps/lib/librocm_sysdeps_elf
      rocm_sysdeps/lib/librocm_sysdeps_drm
      rocm_sysdeps/lib/librocm_sysdeps_drm_amdgpu
      rocm_sysdeps/lib/librocm_sysdeps_bz2
    )
    echo "Copying required ROCm libraries to build directory..."
    for rel in "${required_libs[@]}"; do
      cp -v "$rocm_lib_root/$rel".so* "$build_bin_path/" 2>/dev/null || echo "${rel##*/}.so* not found"
    done

    # Bundle libatomic (gcc runtime dep of llama-cli). Some self-hosted GPU
    # runners don't have libatomic1 installed, so ship it with the artifact;
    # the $ORIGIN RPATH step below makes the binary load this bundled copy.
    cp -v /usr/lib/x86_64-linux-gnu/libatomic.so.1* "$build_bin_path/" 2>/dev/null \
      || cp -v "$(gcc -print-file-name=libatomic.so.1)" "$build_bin_path/" 2>/dev/null \
      || echo "libatomic.so.1 not found"
    echo "Finished copying required ROCm libraries"
# Point every bundled library and llama-* binary at its own directory so the
# package resolves its dependencies locally.
- name: Set RPATH for portable distribution
  run: |
    sudo apt-get install -y patchelf
    cd build/bin
    # Set RPATH to $ORIGIN so all libraries (including the comgr stub loader)
    # find deps locally. Only regular, non-symlink files are patched, and a
    # patchelf failure on any one file is ignored (best effort).
    for candidate in *.so* llama-*; do
      if [ -f "$candidate" ] && [ ! -L "$candidate" ]; then
        patchelf --set-rpath '$ORIGIN' "$candidate" 2>/dev/null || true
      fi
    done
# Log the final contents of build/bin for troubleshooting the package.
- name: List build artifacts (including ROCm files)
  run: |
    cd build/bin
    echo "Final build artifacts (including ROCm library files):"
    ls -la
# Publish build/bin as the artifact consumed by the test and release jobs.
- name: Upload build artifacts
  uses: actions/upload-artifact@v4
  with:
    name: llama-ubuntu-rocm-multiarch-x64
    path: build/bin/
    retention-days: 30
# Expose the resolved ROCm version and the commit hash as job outputs,
# preferring the version detected during download over the requested one.
- name: Set job outputs
  id: set-outputs
  run: |
    resolved_rocm="${DETECTED_ROCM_VERSION:-${{ env.ROCM_VERSION }}}"
    {
      echo "rocm_version=$resolved_rocm"
      echo "llamacpp_commit_hash=${LLAMACPP_COMMIT_HASH}"
    } >> $GITHUB_OUTPUT
    echo "Final rocm_version: $resolved_rocm"
    echo "Final llamacpp_commit_hash: ${LLAMACPP_COMMIT_HASH}"
test-gfx:
  # One hardware run of the multiarch artifact on gfx1151. It is the
  # end-to-end safety net for the Tensile-only multiarch package: a real
  # llama-cli inference that exercises the rocBLAS/hipBLASLt GEMM path
  # on-device.
  runs-on: linux-gfx1151-gpu-rocm
  needs: build-ubuntu
  if: needs.build-ubuntu.result == 'success'
  steps:
# Check out the repository on the GPU runner.
- name: Checkout repository
  uses: actions/checkout@v4
# Fetch the multiarch package produced by build-ubuntu into llama-binaries/.
- name: Download build artifacts
  uses: actions/download-artifact@v4
  with:
    name: llama-ubuntu-rocm-multiarch-x64
    path: llama-binaries
# Fetch the small GGUF model used by the inference smoke test.
- name: Download test model
  run: |
    # Pinned to a fixed GitHub release asset instead of huggingface.co: some
    # self-hosted runners cannot reach huggingface.co (curl 35, connection
    # reset), but they can reach GitHub. Source:
    # https://github.com/jimw567/llamacpp-test-assets/releases/tag/test-assets
    model_url="https://github.com/jimw567/llamacpp-test-assets/releases/download/test-assets/Qwen3-0.6B-Q4_0.gguf"
    model_path="Qwen3-0.6B-Q4_0.gguf"
    echo "Downloading test model from: $model_url"
    # Test curl's status inside the condition: the step shell runs with -e, so
    # a bare failing curl would abort before the failure message could print.
    # An empty file is treated as a failed download too.
    if ! curl -fL --retry 5 --retry-all-errors --retry-delay 5 -o "$model_path" "$model_url" \
        || [ ! -s "$model_path" ]; then
      echo "Failed to download model"
      exit 1
    fi
    file_size=$(stat -c%s "$model_path")
    echo "Model downloaded successfully. Size: $file_size bytes"
# Put the downloaded package directory at the front of the loader search
# path for the remaining steps.
- name: Set up library path
  run: |
    bundle_dir="$(pwd)/llama-binaries"
    echo "LD_LIBRARY_PATH=$bundle_dir:$LD_LIBRARY_PATH" >> $GITHUB_ENV
# Run one bounded, deterministic inference on the GPU and verify device
# selection, layer offload, and the computed answer from the verbose log.
- name: Run llama-cli test
  run: |
    llama_cli_path="./llama-binaries/llama-cli"
    model_path="Qwen3-0.6B-Q4_0.gguf"
    output_file="llama_output.txt"

    # Check for the binary BEFORE chmod: the step shell runs with -e, so a
    # chmod on a missing file would abort the step before this diagnostic
    # (and the file listing) could be printed.
    if [ ! -f "$llama_cli_path" ]; then
      echo "llama-cli not found at: $llama_cli_path"
      echo "Available files in llama-binaries:"
      find llama-binaries -type f
      exit 1
    fi
    # Make sure the downloaded binary is executable.
    chmod +x "$llama_cli_path"

    # Use a prompt with a single correct answer and greedy decoding
    # (--temp 0) so the result is deterministic and verifiable.
    prompt="What is 2 + 2? Reply with only the number."
    echo "Running llama-cli test for gfx1151 (multiarch artifact)..."
    echo "Command: $llama_cli_path -m \"$model_path\" -ngl 99 --temp 0 -p \"$prompt\" -st -v"

    # Bound the run: a healthy 0.6B inference finishes in seconds. If the
    # GPU kernel launch hangs (driver/runner issue) the step would otherwise
    # never end and GitHub would never publish the logs. timeout forces a
    # clean failure and lets the captured output flush for diagnosis.
    # -e is suspended around the call so the exit code can be inspected.
    set +e
    timeout 180 "$llama_cli_path" -m "$model_path" -ngl 99 --temp 0 -p "$prompt" -st -v > "$output_file" 2>&1
    exit_code=$?
    set -e

    echo "=== LLAMA-CLI OUTPUT ==="
    if [ -f "$output_file" ] && [ -s "$output_file" ]; then
      cat "$output_file"
    else
      echo "(empty)"
    fi
    echo "=== END OUTPUT ==="
    echo "Process exit code: $exit_code"

    # 124 is the status timeout(1) returns when it had to stop the command.
    if [ $exit_code -eq 124 ]; then
      echo "❌ llama-cli timed out after 180s (GPU kernel likely hung)"
      echo "=== GPU / driver diagnostics ==="
      rocminfo 2>&1 | grep -iE 'Name|Marketing|gfx|Uuid' | head -40 || echo "rocminfo failed"
      (dmesg 2>/dev/null | tail -40) || echo "dmesg unavailable"
      rm -f "$output_file"
      exit 1
    fi
    if [ $exit_code -ne 0 ]; then
      echo "❌ llama-cli exited with error code: $exit_code"
      echo "Checking for missing library dependencies..."
      ldd "$llama_cli_path" || echo "ldd command failed"
      rm -f "$output_file"
      exit 1
    fi

    # Functional checks against current llama.cpp output:
    #   (1) the ROCm GPU was selected,
    #   (2) the model layers were offloaded to it (incl. the output layer),
    #   (3) the model computed the correct answer to "What is 2 + 2?".
    # With greedy decoding (--temp 0) the answer is deterministic, so this
    # verifies the GPU math path end to end, not just that text was emitted.
    found_device=false
    found_offload=false
    found_answer=false

    if grep -q "using device ROCm0" "$output_file"; then
      found_device=true
    fi

    # Qwen3-0.6B has 28 transformer layers + output = 29 GPU assignments per
    # pass (verbose mode logs them more than once; require at least one pass).
    # grep -c exits non-zero on zero matches, hence the `|| true` under -e.
    layers_on_gpu=$(grep -c "assigned to device ROCm0" "$output_file" || true)
    if [ "$layers_on_gpu" -ge 29 ] && grep -q "offloading output layer to GPU" "$output_file"; then
      found_offload=true
    fi

    # The parsed assistant answer ("content" field, printed with -v) must
    # contain 4 and no other digit, so "4" / "The answer is 4." pass while
    # "5", "14", "22" fail. This is the deterministic correctness check.
    answer_content=$(grep -oE '"content":"[^"]*"' "$output_file" | tail -1)
    if echo "$answer_content" | grep -qE '4' && ! echo "$answer_content" | grep -qE '[0-35-9]'; then
      found_answer=true
    fi

    echo "=== TEST RESULTS ==="
    echo "ROCm GPU selected ('using device ROCm0'): $(if [ "$found_device" = true ]; then echo 'FOUND'; else echo 'NOT FOUND'; fi)"
    echo "Layers offloaded to GPU ('assigned to device ROCm0' x$layers_on_gpu + output layer): $(if [ "$found_offload" = true ]; then echo 'FOUND'; else echo 'NOT FOUND'; fi)"
    echo "Correct answer to '2 + 2' (parsed content: ${answer_content:-<none>}): $(if [ "$found_answer" = true ]; then echo 'FOUND'; else echo 'NOT FOUND'; fi)"
    rm -f "$output_file"

    if [ "$found_device" = true ] && [ "$found_offload" = true ] \
      && [ "$found_answer" = true ]; then
      echo "✅ Test PASSED - GPU offload + correct deterministic answer verified"
    else
      echo "❌ Test FAILED - Missing expected outputs"
      exit 1
    fi
create-release:
  runs-on: ubuntu-24.04
  needs:
    - build-ubuntu
    - test-gfx
  permissions:
    contents: write
  # Releases are published only by the nightly dispatch (the external cron
  # passes -f create_release=true); push, pull-request and ordinary manual
  # runs never release. Both the build and the gfx1151 hardware test must
  # have succeeded.
  if: |
    always() &&
    needs.build-ubuntu.result == 'success' &&
    needs.test-gfx.result == 'success' &&
    github.event_name == 'workflow_dispatch' &&
    github.event.inputs.create_release == 'true'
  steps:
# Check out the repository for the release job.
- name: Checkout repository
  uses: actions/checkout@v4
# No artifact name is given, so every artifact of this run is downloaded
# under ./all-artifacts.
- name: Download all build artifacts
  uses: actions/download-artifact@v4
  with:
    path: ./all-artifacts
# Derive today's UTC date tag and record whether a release with that tag
# already exists (outputs: tag, tag_exists).
- name: Generate dated release tag
  id: generate-tag
  env:
    GITHUB_TOKEN: ${{ github.token }}
  run: |
    # The tag follows the date scheme of the ROCm nightlies (e.g. b20260609
    # for the 7.14.0a20260609-era build); at most one release per UTC day.
    release_tag="b$(date -u '+%Y%m%d')"
    echo "tag=${release_tag}" >> $GITHUB_OUTPUT
    if ! gh release view "$release_tag" --repo "$GITHUB_REPOSITORY" >/dev/null 2>&1; then
      echo "Release $release_tag does not exist; will create."
      echo "tag_exists=false" >> $GITHUB_OUTPUT
    else
      echo "Release $release_tag already exists; skipping creation."
      echo "tag_exists=true" >> $GITHUB_OUTPUT
    fi
    echo "Release tag: $release_tag"
# Pack the downloaded multiarch artifact into a tag-named tar.gz in the
# workspace root; skipped when today's release already exists.
- name: Create multiarch archive
  if: steps.generate-tag.outputs.tag_exists == 'false'
  run: |
    TAG="${{ steps.generate-tag.outputs.tag }}"
    root="$PWD"
    artifact_dir="./all-artifacts/llama-ubuntu-rocm-multiarch-x64"
    archive="llama-${TAG}-ubuntu-rocm-multiarch-x64"

    # Guard clause: without the artifact directory there is nothing to pack.
    if [ ! -d "$artifact_dir" ]; then
      echo "ERROR: artifact dir not found: $artifact_dir"
      exit 1
    fi

    echo "Creating ${archive}.tar.gz"
    tar -czf "$root/${archive}.tar.gz" -C "$artifact_dir" .
    ls -la *.tar.gz
# Publish the dated release with the multiarch archive attached; skipped
# when a release for today's tag already exists.
- name: Create GitHub Release
  if: steps.generate-tag.outputs.tag_exists == 'false'
  env:
    GITHUB_TOKEN: ${{ github.token }}
  run: |
    TAG="${{ steps.generate-tag.outputs.tag }}"
    ROCM_VERSION="${{ needs.build-ubuntu.outputs.rocm_version }}"
    LLAMACPP_COMMIT_HASH="${{ needs.build-ubuntu.outputs.llamacpp_commit_hash }}"
    # --target pins the new tag to the commit this run built and tested.
    # Without it gh creates the tag from the repository's default branch,
    # which is not the gfx11 ref the nightly dispatch runs on.
    gh release create "$TAG" \
      --repo "$GITHUB_REPOSITORY" \
      --target "$GITHUB_SHA" \
      --title "$TAG" \
      --notes "**Build**: $TAG
    **OS**: ubuntu
    **GPU Target(s)**: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1153 (single multiarch binary)
    **ROCm Version**: $ROCM_VERSION (multiarch)
    **Llama.cpp Commit**: $LLAMACPP_COMMIT_HASH
    **Build Date**: $(date -u '+%Y-%m-%d %H:%M:%S UTC')
    Prebuilt llama.cpp ROCm binaries — one multiarch package covering the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M) and the RDNA3.5 gfx115x APUs (gfx1150/gfx1151/gfx1153). Built from TheRock's multiarch ROCm runtime, pruned to the CI target arches (per-arch Tensile databases bundled)." \
      *.tar.gz