Skills Eval #108

Workflow file for this run

.github/workflows/skills-eval.yml at 3241f06

	# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	# SPDX-License-Identifier: Apache-2.0

	# Tier 3 Harbor eval for rag-* skills.
	# Three triggers:
	# 1. push to pull-request/* → diff-based: only eval skills changed in the PR
	# 2. schedule (nightly) → all rag-* cpu skills
	# 3. workflow_dispatch → manual trigger with skill selector
	#
	# Note: cve-fix skill requires NSPect (NVIDIA internal tool) — GitHub Actions
	# runners cannot reach it (not security approved). cve-fix runs via GitLab CI
	# on the mirror repo (gitlab-master.nvidia.com/chat-labs/OpenSource/rag).
	#
	# Uses dorny/paths-filter to diff the whole PR against its base branch
	# (cumulative, not per-push), because copy-pr-bot merge commits don't
	# always touch skills/ even when the PR does.

	name: Skills Eval

	# Triggers: copy-pr-bot mirror branches, a nightly schedule, and a manual
	# dispatch with a skill selector.
	on:
	  push:
	    branches:
	      - "pull-request/[0-9]+"
	  schedule:
	    # 2am UTC nightly — all rag-* cpu skills.
	    # A cron expression needs all five fields: minute hour day-of-month
	    # month day-of-week.
	    - cron: "0 2 * * *"
	  workflow_dispatch:
	    inputs:
	      skills:
	        description: "Skill to eval (* for all, or pick one)"
	        required: false
	        default: "*"
	        type: choice
	        options:
	          - "*"
	          - rag-deploy-blueprint
	          - rag-ingest-documents
	          - rag-query-knowledge
	          - rag-configure-infrastructure
	          - rag-configure-retrieval
	          - rag-evaluate-quality
	          - rag-manage-mcp
	          - rag-troubleshoot-blueprint
	          - rag-enable-vlm
	          - rag-enable-guardrails

	# Scopes granted to the workflow's GITHUB_TOKEN for every job.
	# NOTE(review): the only visible consumers of the token are `gh pr view`
	# and the eval agent (via GH_TOKEN). Confirm the agent really needs
	# `contents: write`; otherwise reduce it to `read`.
	permissions:
	  contents: write
	  pull-requests: write

	# One active run per ref: a newer run on the same ref cancels the run that
	# is still in progress.
	# NOTE(review): the group key is the ref only, so scheduled and manual runs
	# on the same branch presumably share a group and can cancel each other —
	# confirm that is intended.
	concurrency:
	  group: skills-eval-${{ github.ref }}
	  cancel-in-progress: true

	# Every `run:` step in this workflow executes under bash unless the step
	# sets its own shell.
	defaults:
	  run:
	    shell: bash

	jobs:
	  # Single job: works out what to evaluate, runs the eval agent, archives
	  # this run's results, and deletes the GPU VMs recorded for this run.
	  eval:
	    name: Eval changed skills against PR
	    runs-on: [self-hosted, rag-eval]

	    # 4-hour cap: 8 cpu skills × ~15 min each = 2h max with 1.5x timeouts.
	    # Nightly runs all skills; PR runs only changed ones so usually much faster.
	    timeout-minutes: 240

	    # Folded scalar: the three lines join into one expression.
	    if: >-
	      startsWith(github.ref, 'refs/heads/pull-request/')
	      || github.event_name == 'workflow_dispatch'
	      || github.event_name == 'schedule'

	    steps:
	      - name: Pre-checkout cleanup — remove root-owned Docker volumes
	        # Must run BEFORE checkout. Root-owned Milvus/MinIO/etcd files from
	        # prior runs block git clean. Docker (running as root inside the
	        # container) removes them. We mount the PARENT dir and `rm` the
	        # target by name — this works even when both the target AND its
	        # parent are root-owned (e.g. when a trial agent's compose file
	        # with a relative bind mount caused Docker to mkdir -p the chain
	        # as root, leaving an empty root-owned dir that the ubuntu runner
	        # can't rmdir because it lacks write on the root-owned parent).
	        run: |
	          WORKSPACE="${GITHUB_WORKSPACE:-/home/ubuntu/actions-runner/_work/rag/rag}"
	          for vol_dir in deploy/compose/volumes ci/volumes ci/deploy/compose ci/deploy; do
	            TARGET="$WORKSPACE/$vol_dir"
	            if [ -e "$TARGET" ]; then
	              PARENT="$(dirname "$TARGET")"
	              NAME="$(basename "$TARGET")"
	              echo "Cleaning $TARGET"
	              docker run --rm -v "$PARENT:/parent" alpine \
	                sh -c "rm -rf \"/parent/$NAME\""
	            fi
	          done

	      - name: Checkout
	        uses: actions/checkout@v5
	        with:
	          # Full history; the later diff against the PR base needs it.
	          fetch-depth: 0

	      - name: Extract PR number + base
	        id: pr
	        if: github.event_name == 'push'
	        env:
	          GH_TOKEN: ${{ github.token }}
	        run: |
	          # Read the short ref (pull-request/<N>) from the runner-provided
	          # environment instead of templating ${{ ... }} into the script.
	          REF="$GITHUB_REF_NAME"
	          PR="${REF##pull-request/}"
	          BASE=$(gh pr view "$PR" --json baseRefName --jq .baseRefName)
	          echo "number=$PR" >> "$GITHUB_OUTPUT"
	          echo "base=$BASE" >> "$GITHUB_OUTPUT"
	          echo "PR #$PR, base=$BASE"

	      - name: Detect skills/ changes vs PR base
	        id: changes
	        if: github.event_name == 'push'
	        uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d
	        with:
	          base: ${{ steps.pr.outputs.base }}
	          # NOTE(review): every later step gates on the `skills` output only;
	          # nothing in this workflow reads `harness`, so a harness-only PR is
	          # skipped — confirm that is intended.
	          filters: |
	            skills:
	              - 'skills/**'
	            harness:
	              - 'scripts/validate_skill_versions.py'
	              - '.github/skill-eval/**'
	              - '.github/workflows/skills-eval.yml'
	              - 'ci/run_skill_eval.sh'

	      - name: Authenticate Brev CLI (long-lived API key)
	        # Mints fresh Brev auth at the start of every run so the workflow
	        # does not depend on persistent ~/.brev/credentials.json state on
	        # the self-hosted runner. Fails loudly upfront if the secret is
	        # missing or wrong — instead of failing 4h later at cleanup.
	        if: github.event_name != 'push' || steps.changes.outputs.skills == 'true'
	        env:
	          BREV_API_KEY: ${{ secrets.BREV_API_KEY }}
	          BREV_ORG_ID: org-2wW9qP0o1LFbMHyE2UnAFPIil8H
	        run: |
	          if [ -z "$BREV_API_KEY" ]; then
	            echo "::error::BREV_API_KEY secret not set"
	            exit 1
	          fi
	          brev login --api-key "$BREV_API_KEY" --org-id "$BREV_ORG_ID"
	          if ! brev ls >/dev/null 2>&1; then
	            echo "::error::Brev auth failed: brev ls returned non-zero after login"
	            exit 1
	          fi
	          echo "Brev auth OK"

	      - name: Run skills eval agent
	        id: agent
	        if: github.event_name != 'push' || steps.changes.outputs.skills == 'true'
	        env:
	          GH_TOKEN: ${{ github.token }}
	          GH_CONFIG_DIR: ${{ runner.temp }}/gh-skill-eval-${{ github.run_id }}
	          INPUT_SKILLS: ${{ inputs.skills }}
	          # PR context is passed through env rather than templated into the
	          # script; number/base are empty on schedule and manual runs,
	          # where the `pr` step is skipped.
	          PR_NUMBER: ${{ steps.pr.outputs.number }}
	          PR_BASE: ${{ steps.pr.outputs.base }}
	          PR_HEAD_SHA: ${{ github.sha }}
	          PR_REPO: ${{ github.repository }}
	          ANTHROPIC_API_KEY: ${{ secrets.NVBASE_INFERENCE_API_KEY }}
	          ANTHROPIC_BASE_URL: https://inference-api.nvidia.com
	          ANTHROPIC_MODEL: aws/anthropic/bedrock-claude-sonnet-4-6
	          NVIDIA_INFERENCE_KEY: ${{ secrets.NVBASE_INFERENCE_API_KEY }}
	          NVIDIA_API_KEY: ${{ secrets.NVBASE_INFERENCE_API_KEY }}
	          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
	          JUDGE_ANTHROPIC_API_KEY: ${{ secrets.NVBASE_INFERENCE_API_KEY }}
	          JUDGE_FULL_MODEL: aws/anthropic/claude-haiku-4-5-v1
	          CLAUDE_CODE_DISABLE_THINKING: "1"
	          TAG: latest
	        run: |
	          mkdir -p "$GH_CONFIG_DIR" /tmp/brev /tmp/skill-eval
	          export GITHUB_RUN_ID="${{ github.run_id }}"
	          # PYTHONPATH lets uvx harbor resolve envs.:Environment from skill-eval/
	          # NOTE(review): this adds $GITHUB_WORKSPACE/skill-eval, while the agent
	          # script below lives under .github/skill-eval/ — confirm the path.
	          export PYTHONPATH="${GITHUB_WORKSPACE}/skill-eval:${PYTHONPATH:-}"
	          if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
	            export MANUAL_FULL_SWEEP=1
	            export MANUAL_SKILLS_FILTER="${INPUT_SKILLS}"
	          fi
	          python3 .github/skill-eval/skills_eval_agent.py

	      - name: Collect results for artifact
	        # Tar ONLY this run's subdir (/tmp/skill-eval/results/<run_id>/), not the whole
	        # results tree. The runner's disk persists between jobs, so /tmp/skill-eval/results/
	        # may also contain leftover subdirs from prior runs. Archiving all of them would
	        # re-upload stale data labeled as the current run — exactly how 06-03's results
	        # ended up in every nightly artifact for 9 days. If the agent didn't write a
	        # subdir for this run_id, there's nothing fresh to upload, so we exit cleanly.
	        if: always() && (github.event_name != 'push' || steps.changes.outputs.skills == 'true')
	        run: |
	          RUN_DIR="/tmp/skill-eval/results/${{ github.run_id }}"
	          if [ ! -d "$RUN_DIR" ]; then
	            echo "no results subdir for this run (${{ github.run_id }}) — nothing to archive"
	            exit 0
	          fi
	          # Only used to decide whether anything exists and to log a count;
	          # `head -50` caps that logged count at 50, the tar below still
	          # archives the whole run directory.
	          RESULTS=$(find "$RUN_DIR" -maxdepth 3 -name "result.json" 2>/dev/null | head -50 || true)
	          if [ -n "$RESULTS" ]; then
	            tar czf /tmp/skills-eval-results.tar.gz \
	              -C /tmp/skill-eval/results "${{ github.run_id }}"
	            echo "archived $(echo "$RESULTS" | wc -l) result.json files for run ${{ github.run_id }}"
	          else
	            echo "results dir exists for this run but no result.json files — nothing to archive"
	          fi

	      - name: Upload results artifact
	        if: always() && (github.event_name != 'push' || steps.changes.outputs.skills == 'true')
	        uses: actions/upload-artifact@v5
	        with:
	          # Artifact name by trigger: nightly / manual / pr-<number>, each
	          # suffixed with the run id. The lines are kept at one indent level
	          # so the folded scalar joins them into a single expression.
	          name: >-
	            ${{ github.event_name == 'schedule'
	            && format('skills-eval-nightly-{0}', github.run_id)
	            || github.event_name == 'workflow_dispatch'
	            && format('skills-eval-manual-{0}', github.run_id)
	            || format('skills-eval-pr-{0}-{1}', steps.pr.outputs.number, github.run_id) }}
	          path: /tmp/skills-eval-results.tar.gz
	          if-no-files-found: ignore
	          retention-days: 7

	      - name: Delete GPU Brev VMs
	        # Run on success AND failure — agent crashes are exactly when GPU
	        # cleanup is most needed. Process substitution (not pipe-to-while)
	        # so FAILED=1 inside the loop is visible after the loop exits.
	        if: always() && (github.event_name != 'push' || steps.changes.outputs.skills == 'true')
	        run: |
	          RECORD="/tmp/brev/started-by-${{ github.run_id }}.txt"
	          if [ ! -f "$RECORD" ]; then
	            echo "No GPU VMs to clean up."
	            exit 0
	          fi
	          echo "Waiting 5 min cooldown before deleting VMs..."
	          sleep 300
	          FAILED=0
	          while read -r INSTANCE; do
	            [ -z "$INSTANCE" ] && continue
	            echo "Deleting Brev VM: $INSTANCE"
	            if brev delete "$INSTANCE"; then
	              echo "OK Deleted $INSTANCE"
	            else
	              # $? here is still the exit status of the failed `brev delete`.
	              echo "::error::Failed to delete $INSTANCE (exit $?)"
	              FAILED=1
	            fi
	          done < <(sort -u "$RECORD")
	          exit $FAILED

	      - name: Skip note when no skills/ changes
	        if: github.event_name == 'push' && steps.changes.outputs.skills != 'true'
	        run: echo "::notice::No skills/ changes in this PR; eval skipped."

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Skills Eval #108

Workflow file

Skills Eval #108

Uh oh!

Workflow file for this run