Skills Eval #108
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Tier 3 Harbor eval for rag-* skills.
# Three triggers:
# 1. push to pull-request/* → diff-based: only eval skills changed in the PR
# 2. schedule (nightly) → all rag-* cpu skills
# 3. workflow_dispatch → manual trigger with skill selector
#
# Note: cve-fix skill requires NSPect (NVIDIA internal tool) — GitHub Actions
# runners cannot reach it (not security approved). cve-fix runs via GitLab CI
# on the mirror repo (gitlab-master.nvidia.com/chat-labs/OpenSource/rag).
#
# Uses dorny/paths-filter for cumulative PR diff (not per-push) — same
# reason as skills-eval.yml. copy-pr-bot merge commits don't always
# touch skills/ even when the PR does.
name: Skills Eval

# Triggers: pushes to copy-pr-bot style PR mirror branches, a nightly cron,
# and a manual dispatch that offers a single-skill (or "*") selector.
on:
  push:
    branches:
      - 'pull-request/[0-9]+'
  schedule:
    # 2am UTC nightly — all rag-* cpu skills
    - cron: '0 2 * * *'
  workflow_dispatch:
    inputs:
      skills:
        description: 'Skill to eval (* for all, or pick one)'
        required: false
        default: '*'
        type: choice
        options:
          - '*'
          - rag-deploy-blueprint
          - rag-ingest-documents
          - rag-query-knowledge
          - rag-configure-infrastructure
          - rag-configure-retrieval
          - rag-evaluate-quality
          - rag-manage-mcp
          - rag-troubleshoot-blueprint
          - rag-enable-vlm
          - rag-enable-guardrails

# Token scopes granted to every job in this workflow.
permissions:
  contents: write
  pull-requests: write

# One run per ref at a time; a newer run on the same ref cancels the older one.
concurrency:
  group: skills-eval-${{ github.ref }}
  cancel-in-progress: true

# Every `run:` step executes under bash unless it overrides the shell.
defaults:
  run:
    shell: bash
jobs:
  # Single job: cleans the self-hosted workspace, works out which skills
  # changed (push events only), runs the eval agent, archives this run's
  # results, and tears down any Brev GPU VMs the run recorded.
  eval:
    name: Eval changed skills against PR
    runs-on: [self-hosted, rag-eval]
    # 4-hour cap: 8 cpu skills × ~15 min each = 2h max with 1.5x timeouts.
    # Nightly runs all skills; PR runs only changed ones so usually much faster.
    timeout-minutes: 240
    if: startsWith(github.ref, 'refs/heads/pull-request/') || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule'
    steps:
      - name: Pre-checkout cleanup — remove root-owned Docker volumes
        # Must run BEFORE checkout. Root-owned Milvus/MinIO/etcd files from
        # prior runs block git clean. Docker (running as root inside the
        # container) removes them. We mount the PARENT dir and `rm` the
        # target by name — this works even when both the target AND its
        # parent are root-owned (e.g. when a trial agent's compose file
        # with a relative bind mount caused Docker to mkdir -p the chain
        # as root, leaving an empty root-owned dir that the ubuntu runner
        # can't rmdir because it lacks write on the root-owned parent).
        run: |
          WORKSPACE="${GITHUB_WORKSPACE:-/home/ubuntu/actions-runner/_work/rag/rag}"
          for vol_dir in deploy/compose/volumes ci/volumes ci/deploy/compose ci/deploy; do
            TARGET="$WORKSPACE/$vol_dir"
            if [ -e "$TARGET" ]; then
              PARENT="$(dirname "$TARGET")"
              NAME="$(basename "$TARGET")"
              echo "Cleaning $TARGET"
              # rm is the container command itself, so the name is passed as a
              # plain argument and never re-parsed by a second shell.
              docker run --rm -v "$PARENT:/parent" alpine \
                rm -rf "/parent/$NAME"
            fi
          done

      - name: Checkout
        uses: actions/checkout@v5
        with:
          # Full history so the PR base is available for the diff below.
          fetch-depth: 0

      - name: Extract PR number + base
        # Push events only: derives the PR number from the pull-request/<N>
        # branch name and asks the GitHub CLI for that PR's base branch.
        id: pr
        if: github.event_name == 'push'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          # Read the ref from the runner-provided env var instead of pasting a
          # ${{ }} expression into the script text.
          PR="${GITHUB_REF_NAME##pull-request/}"
          BASE=$(gh pr view "$PR" --json baseRefName --jq .baseRefName)
          echo "number=$PR" >> "$GITHUB_OUTPUT"
          echo "base=$BASE" >> "$GITHUB_OUTPUT"
          echo "PR #$PR, base=$BASE"

      - name: Detect skills/ changes vs PR base
        id: changes
        if: github.event_name == 'push'
        uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d
        with:
          base: ${{ steps.pr.outputs.base }}
          # NOTE(review): only the `skills` output gates the steps below; the
          # `harness` output is not read anywhere in this job — confirm whether
          # harness-only changes are meant to trigger an eval.
          filters: |
            skills:
              - 'skills/**'
            harness:
              - 'scripts/validate_skill_versions.py'
              - '.github/skill-eval/**'
              - '.github/workflows/skills-eval.yml'
              - 'ci/run_skill_eval.sh'

      - name: Authenticate Brev CLI (long-lived API key)
        # Mints fresh Brev auth at the start of every run so the workflow
        # does not depend on persistent ~/.brev/credentials.json state on
        # the self-hosted runner. Fails loudly upfront if the secret is
        # missing or wrong — instead of failing 4h later at cleanup.
        if: github.event_name != 'push' || steps.changes.outputs.skills == 'true'
        env:
          BREV_API_KEY: ${{ secrets.BREV_API_KEY }}
          BREV_ORG_ID: org-2wW9qP0o1LFbMHyE2UnAFPIil8H
        run: |
          if [ -z "$BREV_API_KEY" ]; then
            echo "::error::BREV_API_KEY secret not set"
            exit 1
          fi
          brev login --api-key "$BREV_API_KEY" --org-id "$BREV_ORG_ID"
          if ! brev ls >/dev/null 2>&1; then
            echo "::error::Brev auth failed: brev ls returned non-zero after login"
            exit 1
          fi
          echo "Brev auth OK"

      - name: Run skills eval agent
        id: agent
        if: github.event_name != 'push' || steps.changes.outputs.skills == 'true'
        env:
          GH_TOKEN: ${{ github.token }}
          GH_CONFIG_DIR: ${{ runner.temp }}/gh-skill-eval-${{ github.run_id }}
          INPUT_SKILLS: ${{ inputs.skills }}
          # PR context is handed over as environment variables rather than
          # interpolated into the script, so a branch name can never be parsed
          # as shell. All of these are empty strings on non-push events.
          PR_NUMBER: ${{ steps.pr.outputs.number }}
          PR_BASE: ${{ steps.pr.outputs.base }}
          PR_HEAD_SHA: ${{ github.sha }}
          PR_REPO: ${{ github.repository }}
          ANTHROPIC_API_KEY: ${{ secrets.NVBASE_INFERENCE_API_KEY }}
          ANTHROPIC_BASE_URL: https://inference-api.nvidia.com
          ANTHROPIC_MODEL: aws/anthropic/bedrock-claude-sonnet-4-6
          NVIDIA_INFERENCE_KEY: ${{ secrets.NVBASE_INFERENCE_API_KEY }}
          NVIDIA_API_KEY: ${{ secrets.NVBASE_INFERENCE_API_KEY }}
          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
          JUDGE_ANTHROPIC_API_KEY: ${{ secrets.NVBASE_INFERENCE_API_KEY }}
          JUDGE_FULL_MODEL: aws/anthropic/claude-haiku-4-5-v1
          CLAUDE_CODE_DISABLE_THINKING: "1"
          TAG: latest
        run: |
          mkdir -p "$GH_CONFIG_DIR" /tmp/brev /tmp/skill-eval
          # GITHUB_RUN_ID and GITHUB_EVENT_NAME are default runner variables,
          # so they are already in the environment and need no re-export.
          # PYTHONPATH lets uvx harbor resolve envs.*:*Environment from skill-eval/
          # NOTE(review): the agent script lives under .github/skill-eval/ while
          # this adds <workspace>/skill-eval — confirm the path is intended.
          export PYTHONPATH="${GITHUB_WORKSPACE}/skill-eval:${PYTHONPATH:-}"
          if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ]; then
            export MANUAL_FULL_SWEEP=1
            export MANUAL_SKILLS_FILTER="${INPUT_SKILLS}"
          fi
          python3 .github/skill-eval/skills_eval_agent.py

      - name: Collect results for artifact
        # Tar ONLY this run's subdir (/tmp/skill-eval/results/<run_id>/), not the whole
        # results tree. The runner's disk persists between jobs, so /tmp/skill-eval/results/
        # may also contain leftover subdirs from prior runs. Archiving all of them would
        # re-upload stale data labeled as the current run — exactly how 06-03's results
        # ended up in every nightly artifact for 9 days. If the agent didn't write a
        # subdir for this run_id, there's nothing fresh to upload, so we exit cleanly.
        if: always() && (github.event_name != 'push' || steps.changes.outputs.skills == 'true')
        run: |
          ARCHIVE="/tmp/skills-eval-results.tar.gz"
          RUN_DIR="/tmp/skill-eval/results/${GITHUB_RUN_ID}"
          # The archive path is fixed and /tmp persists on this runner. Remove
          # any tarball left by an earlier run first, otherwise a run with no
          # results would upload the previous run's archive under its own name.
          rm -f "$ARCHIVE"
          if [ ! -d "$RUN_DIR" ]; then
            echo "no results subdir for this run (${GITHUB_RUN_ID}) — nothing to archive"
            exit 0
          fi
          # Count every result.json (no head cap) so the log line is accurate.
          COUNT=$(find "$RUN_DIR" -maxdepth 3 -name "result.json" 2>/dev/null | wc -l || true)
          if [ "${COUNT:-0}" -gt 0 ]; then
            tar czf "$ARCHIVE" \
              -C /tmp/skill-eval/results "${GITHUB_RUN_ID}"
            echo "archived $COUNT result.json files for run ${GITHUB_RUN_ID}"
          else
            echo "results dir exists for this run but no result.json files — nothing to archive"
          fi

      - name: Upload results artifact
        if: always() && (github.event_name != 'push' || steps.changes.outputs.skills == 'true')
        uses: actions/upload-artifact@v5
        with:
          # Artifact name encodes the trigger: nightly, manual, or PR number.
          name: >-
            ${{ github.event_name == 'schedule'
            && format('skills-eval-nightly-{0}', github.run_id)
            || github.event_name == 'workflow_dispatch'
            && format('skills-eval-manual-{0}', github.run_id)
            || format('skills-eval-pr-{0}-{1}', steps.pr.outputs.number, github.run_id) }}
          path: /tmp/skills-eval-results.tar.gz
          # A run that produced no archive is not an error.
          if-no-files-found: ignore
          retention-days: 7

      - name: Delete GPU Brev VMs
        # Run on success AND failure — agent crashes are exactly when GPU
        # cleanup is most needed. Process substitution (not pipe-to-while)
        # so FAILED=1 inside the loop is visible after the loop exits.
        if: always() && (github.event_name != 'push' || steps.changes.outputs.skills == 'true')
        run: |
          RECORD="/tmp/brev/started-by-${GITHUB_RUN_ID}.txt"
          if [ ! -f "$RECORD" ]; then
            echo "No GPU VMs to clean up."
            exit 0
          fi
          echo "Waiting 5 min cooldown before deleting VMs..."
          sleep 300
          FAILED=0
          while read -r INSTANCE; do
            [ -z "$INSTANCE" ] && continue
            echo "Deleting Brev VM: $INSTANCE"
            if brev delete "$INSTANCE"; then
              echo "OK Deleted $INSTANCE"
            else
              echo "::error::Failed to delete $INSTANCE (exit $?)"
              FAILED=1
            fi
          done < <(sort -u "$RECORD")
          # The run id is reused by re-run attempts. Drop the record once every
          # VM is gone so a re-run does not try to delete them again; keep it
          # on failure so the leftover instances can still be identified.
          if [ "$FAILED" -eq 0 ]; then
            rm -f "$RECORD"
          fi
          exit "$FAILED"

      - name: Skip note when no skills/ changes
        if: github.event_name == 'push' && steps.changes.outputs.skills != 'true'
        run: echo "::notice::No skills/ changes in this PR; eval skipped."