gfx11: sync ROCm CI + RDNA3.5 MMQ device table onto upstream-synced master #44
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Builds one multiarch llama.cpp + ROCm package, runs a single on-device inference | |
| # test against it, and optionally publishes the binaries as a dated GitHub Release. | |
| name: Build gfx11 + ROCm | |
| on: | |
| push: | |
| branches: [gfx11] | |
| pull_request: | |
| types: [opened, synchronize, reopened] | |
| # Manual or externally scripted runs. The create-release job compares | |
| # create_release against the literal string 'true'. | |
| workflow_dispatch: | |
| inputs: | |
| rocm_version: | |
| description: 'ROCm version to use (e.g., 7.14.0a20260608) or "latest" to auto-detect' | |
| required: false | |
| default: '7.14.0a20260608' | |
| create_release: | |
| description: 'Publish a dated GitHub Release (b<YYYYMMDD>) with the built binaries. Set true only for the nightly dispatch.' | |
| required: false | |
| default: 'false' | |
| # Pinned to ROCm 7.14.0a20260608: last known-good nightly (libhsa-runtime64 build | |
| # d34cbb6409). The 7.14.0a20260609 and newer nightlies regressed libhsa-runtime64 | |
| # (build 1b2a555677), which segfaults in GpuAgent::InitDma on the gfx115x runners. | |
| # See https://github.com/ROCm/TheRock/issues/5763. Pass rocm_version=latest manually | |
| # to track the newest nightly. | |
| # | |
| # Nightly is driven externally (GitHub `schedule` triggers only run from the default branch). | |
| # On a host you control, with `gh` authenticated (token scope: repo + workflow); | |
| # pass create_release=true so the create-release job publishes the nightly build: | |
| # 0 13 * * * gh workflow run build-gfx11-rocm.yml --repo ROCm/llama.cpp --ref gfx11 -f rocm_version=7.14.0a20260608 -f create_release=true | |
| # Falls back to the pinned version whenever the dispatch input is empty or absent | |
| # (e.g. push and pull_request runs). | |
| env: | |
| ROCM_VERSION: ${{ github.event.inputs.rocm_version || '7.14.0a20260608' }} | |
| jobs: | |
| build-ubuntu: | |
| runs-on: ubuntu-24.04 | |
| # Single multiarch build: one fat binary covering all current CI arches, | |
| # sourced from TheRock's multiarch tarball (arch-neutral host + per-arch | |
| # Tensile DBs). gfx1100-1103 (RDNA3 desktop + Hawk Point/Phoenix 760M/780M), | |
| # gfx1150/1151/1153 (RDNA3.5 Strix APUs). | |
| env: | |
| GPU_TARGETS: gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151;gfx1153 | |
| # Values consumed by the create-release job; written by the "Set job outputs" | |
| # step (id: set-outputs). | |
| outputs: | |
| rocm_version: ${{ steps.set-outputs.outputs.rocm_version }} | |
| llamacpp_commit_hash: ${{ steps.set-outputs.outputs.llamacpp_commit_hash }} | |
| steps: | |
| - name: Free disk space | |
| # Remove unused runner files to free up disk space | |
| # The helper script is fetched at a pinned commit and piped straight into bash. | |
| # NOTE(review): the step result is bash's exit status, so a failed download | |
| # (empty stdin) would presumably pass silently — confirm best-effort is intended. | |
| run: curl -fsSL https://raw.githubusercontent.com/kou/arrow/e49d8ae15583ceff03237571569099a6ad62be32/ci/scripts/util_free_space.sh | bash | |
| # Check out the revision that triggered the run; later steps build from the workspace root. | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| - name: Clean up existing ROCm directory (safety precaution) | |
| run: | | |
| # Start from a clean slate: drop any ROCm tree already present on the runner. | |
| rocm_root="/opt/rocm" | |
| if [[ -d "$rocm_root" ]]; then | |
| echo "Removing existing /opt/rocm directory..." | |
| sudo rm -rf "$rocm_root" | |
| fi | |
| # Likewise discard a leftover tarball in the workspace, if any. | |
| leftover_tarball="rocm.tar.gz" | |
| if [[ -f "$leftover_tarball" ]]; then | |
| rm -f "$leftover_tarball" | |
| fi | |
| echo "Cleanup completed successfully" | |
| - name: Install build dependencies | |
| run: | | |
| # Use apt-get rather than apt: apt-get is the script-stable front end and | |
| # matches the patchelf install in the RPATH step of this job. | |
| echo "Installing build dependencies..." | |
| sudo apt-get update | |
| sudo apt-get install -y cmake ninja-build unzip curl | |
| echo "Verifying installations..." | |
| # Either command failing here stops the job before the long download/build. | |
| cmake --version | |
| ninja --version | |
| echo "Build dependencies installation completed" | |
| - name: Download and extract multiarch ROCm directly to /opt/rocm | |
| run: | | |
| # Read the requested version from the workflow-level ROCM_VERSION environment | |
| # variable instead of interpolating the expression into the script text, so a | |
| # workflow_dispatch input can never be parsed as shell code. | |
| rocm_version="$ROCM_VERSION" | |
| # Directory listing and download root for TheRock multiarch tarballs. | |
| base_url="https://rocm.nightlies.amd.com/tarball-multi-arch" | |
| # Resolve "latest" to the newest concrete version listed on the tarball host. | |
| if [ "$rocm_version" = "latest" ]; then | |
| echo "Auto-detecting latest multiarch ROCm version" | |
| # The multiarch host serves an HTML index (not S3 XML); scrape the | |
| # multiarch tarball names from it. --fail turns an HTTP error into a clear | |
| # failure here instead of an empty candidate list further down. | |
| if ! index_html=$(curl -fsSL "$base_url/"); then | |
| echo "Failed to fetch multiarch index from: $base_url/" | |
| exit 1 | |
| fi | |
| files=$(printf '%s\n' "$index_html" \ | |
| | grep -oE 'therock-dist-linux-multiarch-[0-9]+\.[0-9]+\.[0-9]+(a|rc)[0-9]+\.tar\.gz' \ | |
| | sort -u) | |
| # Capture groups: 1=major 2=minor 3=patch 4=pre-release kind (a|rc) 5=serial. | |
| name_re='therock-dist-linux-multiarch-([0-9]+)\.([0-9]+)\.([0-9]+)(a|rc)([0-9]+)\.tar\.gz' | |
| latest_file="" | |
| latest_version="" | |
| latest_major=0 | |
| latest_minor=0 | |
| latest_patch=0 | |
| latest_rc=0 | |
| latest_is_alpha=false | |
| while IFS= read -r file; do | |
| if [[ "$file" =~ $name_re ]]; then | |
| # Split the version with the regex captures (no cut/sed subprocesses). | |
| major="${BASH_REMATCH[1]}" | |
| minor="${BASH_REMATCH[2]}" | |
| patch="${BASH_REMATCH[3]}" | |
| kind="${BASH_REMATCH[4]}" | |
| rc="${BASH_REMATCH[5]}" | |
| is_alpha=false | |
| if [ "$kind" = "a" ]; then | |
| is_alpha=true | |
| fi | |
| # Order by major, minor, patch; on a full tie an alpha ("a") build outranks | |
| # an rc build, and within the same kind the larger serial wins. | |
| is_newer=false | |
| if [ "$major" -gt "$latest_major" ]; then | |
| is_newer=true | |
| elif [ "$major" -eq "$latest_major" ] && [ "$minor" -gt "$latest_minor" ]; then | |
| is_newer=true | |
| elif [ "$major" -eq "$latest_major" ] && [ "$minor" -eq "$latest_minor" ] && [ "$patch" -gt "$latest_patch" ]; then | |
| is_newer=true | |
| elif [ "$major" -eq "$latest_major" ] && [ "$minor" -eq "$latest_minor" ] && [ "$patch" -eq "$latest_patch" ]; then | |
| if [ "$is_alpha" = true ] && [ "$latest_is_alpha" = false ]; then | |
| is_newer=true | |
| elif [ "$is_alpha" = "$latest_is_alpha" ] && [ "$rc" -gt "$latest_rc" ]; then | |
| is_newer=true | |
| fi | |
| fi | |
| if [ "$is_newer" = true ]; then | |
| latest_file="$file" | |
| latest_version="${major}.${minor}.${patch}${kind}${rc}" | |
| latest_major="$major" | |
| latest_minor="$minor" | |
| latest_patch="$patch" | |
| latest_rc="$rc" | |
| latest_is_alpha="$is_alpha" | |
| fi | |
| fi | |
| done <<< "$files" | |
| echo "Found latest file: $latest_file" | |
| # latest_version stays empty when the index listed no matching tarball. | |
| if [ -n "$latest_version" ]; then | |
| rocm_version="$latest_version" | |
| echo "Detected latest ROCm version: $rocm_version" | |
| else | |
| echo "Failed to extract ROCm version from latest file: $latest_file" | |
| echo "Expected pattern: therock-dist-linux-multiarch-<version>.tar.gz" | |
| exit 1 | |
| fi | |
| fi | |
| rocm_url="$base_url/therock-dist-linux-multiarch-${rocm_version}.tar.gz" | |
| # Publish the resolved version (concrete even when "latest" was requested) to later steps. | |
| echo "DETECTED_ROCM_VERSION=$rocm_version" >> "$GITHUB_ENV" | |
| # The multiarch tarball (~11.5 GB) ships device code for ALL 26 GPU | |
| # arches: every arch's .kpack plus per-arch rocBLAS/hipBLASLt Tensile DBs. | |
| # This consumer build only needs the 7 RDNA3/3.5 arches in GPU_TARGETS and | |
| # uses the GEMM (Tensile) path, which works without .kpack files. So we | |
| # stream-extract and prune at the tar level: drop ALL .kpack, and drop the | |
| # Tensile DBs of every arch not in our target set. This keeps the runner | |
| # disk footprint small (the 11.5 GB is streamed, never stored) and yields | |
| # a lean multiarch package. tar matches --exclude on pre-strip member | |
| # names, hence the leading "./". | |
| drop_arches="gfx900 gfx906 gfx908 gfx90a gfx942 gfx950 \ | |
| gfx1010 gfx1011 gfx1012 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 \ | |
| gfx1152 gfx1200 gfx1201" | |
| excludes=(--exclude='./.kpack' --exclude='./.kpack/*') | |
| # $drop_arches is deliberately unquoted: word splitting yields one arch per iteration. | |
| for a in $drop_arches; do | |
| excludes+=("--exclude=./lib/*/library/${a}") | |
| excludes+=("--exclude=./lib/*/library/${a}/*") | |
| excludes+=("--exclude=./lib/*/library/*${a}*") | |
| done | |
| echo "Streaming multiarch ROCm from: $rocm_url (pruning .kpack + non-target arches)" | |
| sudo mkdir -p /opt/rocm | |
| # --fail stops curl from streaming an HTTP error page into tar (which would | |
| # surface only as a confusing gzip error); -S keeps curl's own error message | |
| # visible despite -s. | |
| curl -fsSL "$rocm_url" | sudo tar --use-compress-program=gzip -xf - \ | |
| -C /opt/rocm --strip-components=1 "${excludes[@]}" | |
| # Informational listing only: a missing directory prints "(none)" rather than failing. | |
| echo "Retained rocBLAS Tensile arch dirs:" | |
| ls /opt/rocm/lib/rocblas/library/ 2>/dev/null || echo "(none)" | |
| echo "Retained hipBLASLt Tensile arch dirs:" | |
| ls /opt/rocm/lib/hipblaslt/library/ 2>/dev/null || echo "(none)" | |
| - name: Set ROCm environment variables | |
| run: | | |
| echo "Setting ROCm environment variables..." | |
| # Lines appended to $GITHUB_ENV become environment variables for the later | |
| # steps of this job. | |
| { | |
| echo "HIP_PATH=/opt/rocm" | |
| echo "ROCM_PATH=/opt/rocm" | |
| echo "HIP_PLATFORM=amd" | |
| echo "HIP_CLANG_PATH=/opt/rocm/llvm/bin" | |
| echo "HIP_INCLUDE_PATH=/opt/rocm/include" | |
| echo "HIP_LIB_PATH=/opt/rocm/lib" | |
| echo "HIP_DEVICE_LIB_PATH=/opt/rocm/lib/llvm/amdgcn/bitcode" | |
| # Append the previous value only when it is non-empty. A bare trailing ":" | |
| # would add an empty path element, which the dynamic loader and GCC/Clang | |
| # interpret as the current working directory. | |
| echo "LD_LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64:/opt/rocm/llvm/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" | |
| echo "LIBRARY_PATH=/opt/rocm/lib:/opt/rocm/lib64${LIBRARY_PATH:+:$LIBRARY_PATH}" | |
| echo "CPATH=/opt/rocm/include${CPATH:+:$CPATH}" | |
| echo "PKG_CONFIG_PATH=/opt/rocm/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}" | |
| } >> "$GITHUB_ENV" | |
| # NOTE(review): GITHUB_PATH is documented as one directory per line; this entry | |
| # is colon-joined and embeds the current PATH. Kept as-is to preserve the | |
| # existing lookup order — confirm before splitting it into separate lines. | |
| echo "/opt/rocm/bin:/opt/rocm/llvm/bin:$PATH" >> "$GITHUB_PATH" | |
| echo "ROCm environment variables set successfully" | |
| - name: Record llama.cpp commit hash | |
| run: | | |
| # Abbreviated id of the checked-out revision (git is asked for 5 characters). | |
| short_sha="$(git rev-parse --short=5 HEAD)" | |
| echo "LLAMACPP_COMMIT_HASH=$short_sha" >> "$GITHUB_ENV" | |
| echo "llama.cpp commit hash (5 digits): $short_sha" | |
| # Log the full one-line summary for traceability in the job log. | |
| echo "Current llama.cpp commit:" | |
| git log --oneline -1 | |
| - name: Build Llama.cpp + ROCm | |
| run: | | |
| gpu_targets="${{ env.GPU_TARGETS }}" | |
| echo "Building multiarch binary (GPU_TARGETS=$gpu_targets)" | |
| # Configure options, one per element; the semicolon-separated arch list stays | |
| # a single argument because the element is quoted. | |
| cmake_args=( | |
| -G Ninja | |
| -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang | |
| -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ | |
| -DCMAKE_CXX_FLAGS="-I/opt/rocm/include" | |
| -DCMAKE_CROSSCOMPILING=ON | |
| -DCMAKE_BUILD_TYPE=Release | |
| -DGPU_TARGETS="$gpu_targets" | |
| -DBUILD_SHARED_LIBS=ON | |
| -DLLAMA_BUILD_TESTS=OFF | |
| -DGGML_HIP=ON | |
| -DGGML_OPENMP=OFF | |
| -DGGML_CUDA_FORCE_CUBLAS=OFF | |
| -DGGML_RPC=ON | |
| -DGGML_HIP_ROCWMMA_FATTN=OFF | |
| -DLLAMA_BUILD_BORINGSSL=ON | |
| -DGGML_NATIVE=OFF | |
| -DGGML_STATIC=OFF | |
| -DCMAKE_SYSTEM_NAME=Linux | |
| ) | |
| # Out-of-source build in ./build, using every available core. | |
| mkdir build | |
| cd build | |
| cmake .. "${cmake_args[@]}" | |
| cmake --build . -j "$(nproc)" | |
| - name: Copy ROCm core libs to build directory | |
| run: | | |
| build_bin_path="build/bin" | |
| # Bundle the rocBLAS and hipBLASLt "library" folders (with all contents) as | |
| # <build_bin_path>/<component>/library, mirroring their layout under /opt/rocm/lib. | |
| for component in rocblas hipblaslt; do | |
| tensile_src="/opt/rocm/lib/${component}/library" | |
| bundle_parent="$build_bin_path/${component}" | |
| # A missing folder is reported but does not fail the step. | |
| if [ ! -d "$tensile_src" ]; then | |
| echo "Warning: ${component}/library folder not found at: $tensile_src" | |
| continue | |
| fi | |
| echo "Copying ${component}/library folder and all contents..." | |
| mkdir -p "$bundle_parent" | |
| cp -r "$tensile_src" "$bundle_parent/" | |
| echo "Copied: ${component}/library folder with all contents" | |
| done | |
| # Copy required ROCm libraries to build directory | |
| # If artifacts from ROCm or Llama.cpp change, you may need to update this list. | |
| # To regenerate the list, run: | |
| # utils/gather_required_libs.py --rocm-dir /opt/rocm --dest-dir build/bin | |
| # Every copy below is best-effort: cp's own errors are discarded and a missing | |
| # library only prints a "not found" line, so this step never fails on a gap. | |
| echo "Copying required ROCm libraries to build directory..." | |
| cp -v /opt/rocm/lib/libhipblas.so* "$build_bin_path/" 2>/dev/null || echo "libhipblas.so* not found" | |
| cp -v /opt/rocm/lib/librocblas.so* "$build_bin_path/" 2>/dev/null || echo "librocblas.so* not found" | |
| cp -v /opt/rocm/lib/libamdhip64.so* "$build_bin_path/" 2>/dev/null || echo "libamdhip64.so* not found" | |
| cp -v /opt/rocm/lib/librocsolver.so* "$build_bin_path/" 2>/dev/null || echo "librocsolver.so* not found" | |
| cp -v /opt/rocm/lib/libroctx64.so* "$build_bin_path/" 2>/dev/null || echo "libroctx64.so* not found" | |
| cp -v /opt/rocm/lib/libhipblaslt.so* "$build_bin_path/" 2>/dev/null || echo "libhipblaslt.so* not found" | |
| cp -v /opt/rocm/lib/rocm_sysdeps/lib/librocm_sysdeps_liblzma.so* "$build_bin_path/" 2>/dev/null || echo "librocm_sysdeps_liblzma.so* not found" | |
| cp -v /opt/rocm/lib/librocprofiler-register.so* "$build_bin_path/" 2>/dev/null || echo "librocprofiler-register.so* not found" | |
| cp -v /opt/rocm/lib/libamd_comgr.so* "$build_bin_path/" 2>/dev/null || echo "libamd_comgr.so* not found" | |
| cp -v /opt/rocm/lib/libamd_comgr_loader.so* "$build_bin_path/" 2>/dev/null || echo "libamd_comgr_loader.so* not found" | |
| cp -v /opt/rocm/lib/libhsa-runtime64.so* "$build_bin_path/" 2>/dev/null || echo "libhsa-runtime64.so* not found" | |
| cp -v /opt/rocm/lib/rocm_sysdeps/lib/librocm_sysdeps_numa.so* "$build_bin_path/" 2>/dev/null || echo "librocm_sysdeps_numa.so* not found" | |
| cp -v /opt/rocm/lib/librocroller.so* "$build_bin_path/" 2>/dev/null || echo "librocroller.so* not found" | |
| cp -v /opt/rocm/lib/librocm_kpack.so* "$build_bin_path/" 2>/dev/null || echo "librocm_kpack.so* not found" | |
| cp -v /opt/rocm/lib/rocm_sysdeps/lib/librocm_sysdeps_z.so* "$build_bin_path/" 2>/dev/null || echo "librocm_sysdeps_z.so* not found" | |
| cp -v /opt/rocm/lib/rocm_sysdeps/lib/librocm_sysdeps_zstd.so* "$build_bin_path/" 2>/dev/null || echo "librocm_sysdeps_zstd.so* not found" | |
| cp -v /opt/rocm/lib/llvm/lib/libLLVM.so* "$build_bin_path/" 2>/dev/null || echo "libLLVM.so* not found" | |
| cp -v /opt/rocm/lib/llvm/lib/libclang-cpp.so* "$build_bin_path/" 2>/dev/null || echo "libclang-cpp.so* not found" | |
| cp -v /opt/rocm/lib/rocm_sysdeps/lib/librocm_sysdeps_elf.so* "$build_bin_path/" 2>/dev/null || echo "librocm_sysdeps_elf.so* not found" | |
| cp -v /opt/rocm/lib/rocm_sysdeps/lib/librocm_sysdeps_drm.so* "$build_bin_path/" 2>/dev/null || echo "librocm_sysdeps_drm.so* not found" | |
| cp -v /opt/rocm/lib/rocm_sysdeps/lib/librocm_sysdeps_drm_amdgpu.so* "$build_bin_path/" 2>/dev/null || echo "librocm_sysdeps_drm_amdgpu.so* not found" | |
| cp -v /opt/rocm/lib/rocm_sysdeps/lib/librocm_sysdeps_bz2.so* "$build_bin_path/" 2>/dev/null || echo "librocm_sysdeps_bz2.so* not found" | |
| # Bundle libatomic (gcc runtime dep of llama-cli). Some self-hosted GPU | |
| # runners don't have libatomic1 installed, so ship it with the artifact; | |
| # the $ORIGIN RPATH step below makes the binary load this bundled copy. | |
| # Try the Debian/Ubuntu multiarch path first, then whatever path gcc reports. | |
| cp -v /usr/lib/x86_64-linux-gnu/libatomic.so.1* "$build_bin_path/" 2>/dev/null \ | |
| || cp -v "$(gcc -print-file-name=libatomic.so.1)" "$build_bin_path/" 2>/dev/null \ | |
| || echo "libatomic.so.1 not found" | |
| echo "Finished copying required ROCm libraries" | |
| - name: Set RPATH for portable distribution | |
| run: | | |
| sudo apt-get install -y patchelf | |
| cd build/bin | |
| # Set RPATH to $ORIGIN so all libraries (including the comgr stub loader) find deps locally | |
| for candidate in *.so* llama-*; do | |
| # Only regular, non-symlink files are patched; this also skips a glob | |
| # pattern that matched nothing and was left as literal text. | |
| if [ ! -f "$candidate" ] || [ -L "$candidate" ]; then | |
| continue | |
| fi | |
| # Best-effort: anything patchelf cannot rewrite is left untouched. | |
| patchelf --set-rpath '$ORIGIN' "$candidate" 2>/dev/null || true | |
| done | |
| # Log the final contents of build/bin so the uploaded payload is visible in the job log. | |
| - name: List build artifacts (including ROCm files) | |
| run: | | |
| cd build/bin | |
| echo "Final build artifacts (including ROCm library files):" | |
| ls -la | |
| # Publish build/bin under a fixed artifact name; the test-gfx job downloads it | |
| # by this exact name and create-release looks for it in a directory of the same name. | |
| - name: Upload build artifacts | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: llama-ubuntu-rocm-multiarch-x64 | |
| path: build/bin/ | |
| retention-days: 30 | |
| - name: Set job outputs | |
| id: set-outputs | |
| run: | | |
| # Prefer the version resolved by the download step (concrete even when | |
| # "latest" was requested); otherwise fall back to the workflow-level | |
| # ROCM_VERSION. Both are read from the environment rather than interpolated | |
| # into the script text, so a dispatch input is never parsed as shell code. | |
| rocm_version="${DETECTED_ROCM_VERSION:-$ROCM_VERSION}" | |
| # These two keys back the job-level outputs read by the create-release job. | |
| { | |
| echo "rocm_version=$rocm_version" | |
| echo "llamacpp_commit_hash=${LLAMACPP_COMMIT_HASH}" | |
| } >> "$GITHUB_OUTPUT" | |
| echo "Final rocm_version: $rocm_version" | |
| echo "Final llamacpp_commit_hash: ${LLAMACPP_COMMIT_HASH}" | |
| test-gfx: | |
| needs: build-ubuntu | |
| # Run only when the build job finished successfully. | |
| if: needs.build-ubuntu.result == 'success' | |
| # Single hardware test of the multiarch artifact on gfx1151. This is the | |
| # end-to-end safety net for the Tensile-only multiarch package: a real | |
| # llama-cli inference exercising the rocBLAS/hipBLASLt GEMM path on-device. | |
| runs-on: linux-gfx1151-gpu-rocm | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| # Fetch the package uploaded by build-ubuntu (same artifact name) into ./llama-binaries. | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: llama-ubuntu-rocm-multiarch-x64 | |
| path: llama-binaries | |
| - name: Download test model | |
| run: | | |
| # Pinned to a fixed GitHub release asset instead of huggingface.co: some | |
| # self-hosted runners cannot reach huggingface.co (curl 35, connection | |
| # reset), but they can reach GitHub. Source: | |
| # https://github.com/jimw567/llamacpp-test-assets/releases/tag/test-assets | |
| model_url="https://github.com/jimw567/llamacpp-test-assets/releases/download/test-assets/Qwen3-0.6B-Q4_0.gguf" | |
| model_path="Qwen3-0.6B-Q4_0.gguf" | |
| echo "Downloading test model from: $model_url" | |
| # Test curl inside the condition: when the script runs with errexit, a bare | |
| # failing curl would end the step before the failure message could be printed. | |
| if ! curl -fL --retry 5 --retry-all-errors --retry-delay 5 -o "$model_path" "$model_url"; then | |
| echo "Failed to download model" | |
| exit 1 | |
| fi | |
| # Require a non-empty file; a zero-byte download is treated as a failure too. | |
| if [ -s "$model_path" ]; then | |
| file_size=$(stat -c%s "$model_path") | |
| echo "Model downloaded successfully. Size: $file_size bytes" | |
| else | |
| echo "Failed to download model" | |
| exit 1 | |
| fi | |
| - name: Set up library path | |
| run: | | |
| # Put the downloaded package first in the loader search path for later steps. | |
| # The previous value is appended only when non-empty: a trailing ":" would add | |
| # an empty element, which the dynamic loader treats as the current directory. | |
| echo "LD_LIBRARY_PATH=$(pwd)/llama-binaries${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" >> "$GITHUB_ENV" | |
| - name: Run llama-cli test | |
| run: | | |
| llama_cli_path="./llama-binaries/llama-cli" | |
| model_path="Qwen3-0.6B-Q4_0.gguf" | |
| output_file="llama_output.txt" | |
| # Check for the binary BEFORE chmod: when the script runs with errexit, a | |
| # chmod on a missing file would abort the step before the diagnostics below | |
| # could be printed. | |
| if [ ! -f "$llama_cli_path" ]; then | |
| echo "llama-cli not found at: $llama_cli_path" | |
| echo "Available files in llama-binaries:" | |
| find llama-binaries -type f | |
| exit 1 | |
| fi | |
| # Make sure the downloaded binary is executable. | |
| chmod +x "$llama_cli_path" | |
| # Use a prompt with a single correct answer and greedy decoding | |
| # (--temp 0) so the result is deterministic and verifiable. | |
| prompt="What is 2 + 2? Reply with only the number." | |
| echo "Running llama-cli test for gfx1151 (multiarch artifact)..." | |
| echo "Command: $llama_cli_path -m \"$model_path\" -ngl 99 --temp 0 -p \"$prompt\" -st -v" | |
| # Bound the run: a healthy 0.6B inference finishes in seconds. If the | |
| # GPU kernel launch hangs (driver/runner issue) the step would otherwise | |
| # never end and GitHub would never publish the logs. timeout forces a | |
| # clean failure and lets the captured output flush for diagnosis. | |
| # errexit is suspended around the run so the exit status can be inspected. | |
| set +e | |
| timeout 180 "$llama_cli_path" -m "$model_path" -ngl 99 --temp 0 -p "$prompt" -st -v > "$output_file" 2>&1 | |
| exit_code=$? | |
| set -e | |
| # Always replay the captured stdout+stderr before judging the result. | |
| echo "=== LLAMA-CLI OUTPUT ===" | |
| if [ ! -f "$output_file" ] || [ ! -s "$output_file" ]; then | |
| echo "(empty)" | |
| else | |
| cat "$output_file" | |
| fi | |
| echo "=== END OUTPUT ===" | |
| echo "Process exit code: $exit_code" | |
| case "$exit_code" in | |
| 0) | |
| # Clean exit: continue with the functional checks. | |
| ;; | |
| 124) | |
| # 124 is the status timeout reports when the time limit was hit. | |
| echo "❌ llama-cli timed out after 180s (GPU kernel likely hung)" | |
| echo "=== GPU / driver diagnostics ===" | |
| rocminfo 2>&1 | grep -iE 'Name|Marketing|gfx|Uuid' | head -40 || echo "rocminfo failed" | |
| (dmesg 2>/dev/null | tail -40) || echo "dmesg unavailable" | |
| rm -f "$output_file" | |
| exit 1 | |
| ;; | |
| *) | |
| # Any other failure: show the dynamic-link picture to spot missing libraries. | |
| echo "❌ llama-cli exited with error code: $exit_code" | |
| echo "Checking for missing library dependencies..." | |
| ldd "$llama_cli_path" || echo "ldd command failed" | |
| rm -f "$output_file" | |
| exit 1 | |
| ;; | |
| esac | |
| # Functional checks against current llama.cpp output: | |
| # (1) the ROCm GPU was selected, | |
| # (2) the model layers were offloaded to it (incl. the output layer), | |
| # (3) the model computed the correct answer to "What is 2 + 2?". | |
| # With greedy decoding (--temp 0) the answer is deterministic, so this | |
| # verifies the GPU math path end to end, not just that text was emitted. | |
| # Helper: render a true/false flag as the label used in the summary lines. | |
| verdict() { | |
| if [ "$1" = true ]; then echo 'FOUND'; else echo 'NOT FOUND'; fi | |
| } | |
| found_device=false | |
| grep -q "using device ROCm0" "$output_file" && found_device=true | |
| # Qwen3-0.6B has 28 transformer layers + output = 29 GPU assignments per | |
| # pass (verbose mode logs them more than once; require at least one pass). | |
| found_offload=false | |
| layers_on_gpu=$(grep -c "assigned to device ROCm0" "$output_file" || true) | |
| if [ "$layers_on_gpu" -ge 29 ] && grep -q "offloading output layer to GPU" "$output_file"; then | |
| found_offload=true | |
| fi | |
| # The parsed assistant answer ("content" field, printed with -v) must | |
| # contain 4 and no other digit, so "4" / "The answer is 4." pass while | |
| # "5", "14", "22" fail. This is the deterministic correctness check. | |
| found_answer=false | |
| answer_content=$(grep -oE '"content":"[^"]*"' "$output_file" | tail -1) | |
| if grep -qE '4' <<< "$answer_content" && ! grep -qE '[0-35-9]' <<< "$answer_content"; then | |
| found_answer=true | |
| fi | |
| echo "=== TEST RESULTS ===" | |
| echo "ROCm GPU selected ('using device ROCm0'): $(verdict "$found_device")" | |
| echo "Layers offloaded to GPU ('assigned to device ROCm0' x$layers_on_gpu + output layer): $(verdict "$found_offload")" | |
| echo "Correct answer to '2 + 2' (parsed content: ${answer_content:-<none>}): $(verdict "$found_answer")" | |
| rm -f "$output_file" | |
| # All three checks must hold; otherwise fail the step. | |
| if [ "$found_device" != true ] || [ "$found_offload" != true ] \ | |
| || [ "$found_answer" != true ]; then | |
| echo "❌ Test FAILED - Missing expected outputs" | |
| exit 1 | |
| fi | |
| echo "✅ Test PASSED - GPU offload + correct deterministic answer verified" | |
| create-release: | |
| needs: [build-ubuntu, test-gfx] | |
| runs-on: ubuntu-24.04 | |
| # contents: write is what lets the job token create the release. | |
| permissions: | |
| contents: write | |
| # Publish only on the nightly dispatch (external cron passes | |
| # -f create_release=true). Push/PR and manual runs never release. | |
| # Require the build to succeed and the gfx1151 hardware test to pass. | |
| if: | | |
| always() && | |
| needs.build-ubuntu.result == 'success' && | |
| needs.test-gfx.result == 'success' && | |
| github.event_name == 'workflow_dispatch' && | |
| github.event.inputs.create_release == 'true' | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| # No artifact name is given, so this is not limited to a single named artifact; | |
| # the archive step expects ./all-artifacts/llama-ubuntu-rocm-multiarch-x64. | |
| - name: Download all build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| path: ./all-artifacts | |
| - name: Generate dated release tag | |
| id: generate-tag | |
| env: | |
| GITHUB_TOKEN: ${{ github.token }} | |
| run: | | |
| # Date tag mirrors the ROCm nightly scheme (e.g. b20260609 for the | |
| # 7.14.0a20260609-era build). One release per UTC day. | |
| TAG="b$(date -u '+%Y%m%d')" | |
| echo "tag=${TAG}" >> "$GITHUB_OUTPUT" | |
| # Probe for an existing release with this tag; later steps key off tag_exists. | |
| release_exists=false | |
| if gh release view "$TAG" --repo "$GITHUB_REPOSITORY" >/dev/null 2>&1; then | |
| release_exists=true | |
| fi | |
| if [ "$release_exists" = true ]; then | |
| echo "Release $TAG already exists; skipping creation." | |
| else | |
| echo "Release $TAG does not exist; will create." | |
| fi | |
| echo "tag_exists=${release_exists}" >> "$GITHUB_OUTPUT" | |
| echo "Release tag: $TAG" | |
| - name: Create multiarch archive | |
| if: steps.generate-tag.outputs.tag_exists == 'false' | |
| run: | | |
| TAG="${{ steps.generate-tag.outputs.tag }}" | |
| root="$PWD" | |
| artifact_dir="./all-artifacts/llama-ubuntu-rocm-multiarch-x64" | |
| archive="llama-${TAG}-ubuntu-rocm-multiarch-x64" | |
| # Guard: without the downloaded artifact there is nothing to package. | |
| if [ ! -d "$artifact_dir" ]; then | |
| echo "ERROR: artifact dir not found: $artifact_dir" | |
| exit 1 | |
| fi | |
| # Pack the artifact's contents (not the directory itself) into the workspace root. | |
| echo "Creating ${archive}.tar.gz" | |
| tar -czf "$root/${archive}.tar.gz" -C "$artifact_dir" . | |
| ls -la *.tar.gz | |
| - name: Create GitHub Release | |
| if: steps.generate-tag.outputs.tag_exists == 'false' | |
| env: | |
| GITHUB_TOKEN: ${{ github.token }} | |
| run: | | |
| # Tag comes from the generate-tag step; version and commit come from the | |
| # build-ubuntu job outputs. | |
| TAG="${{ steps.generate-tag.outputs.tag }}" | |
| ROCM_VERSION="${{ needs.build-ubuntu.outputs.rocm_version }}" | |
| LLAMACPP_COMMIT_HASH="${{ needs.build-ubuntu.outputs.llamacpp_commit_hash }}" | |
| # The release notes are one multi-line --notes argument; the trailing | |
| # *.tar.gz glob attaches the archive created by the previous step. | |
| gh release create "$TAG" \ | |
| --repo "$GITHUB_REPOSITORY" \ | |
| --title "$TAG" \ | |
| --notes "**Build**: $TAG | |
| **OS**: ubuntu | |
| **GPU Target(s)**: gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1153 (single multiarch binary) | |
| **ROCm Version**: $ROCM_VERSION (multiarch) | |
| **Llama.cpp Commit**: $LLAMACPP_COMMIT_HASH | |
| **Build Date**: $(date -u '+%Y-%m-%d %H:%M:%S UTC') | |
| Prebuilt llama.cpp ROCm binaries — one multiarch package covering the RDNA3 gfx110X family incl. Hawk Point/Phoenix (Radeon 760M/780M) and the RDNA3.5 gfx115x APUs (gfx1150/gfx1151/gfx1153). Built from TheRock's multiarch ROCm runtime, pruned to the CI target arches (per-arch Tensile databases bundled)." \ | |
| *.tar.gz | |