Update Benchmark Data #5494

Workflow file for this run

.github/workflows/update-benchmarks.yml at b4e10c1

	name: Update Benchmark Data

	# Manual / API dispatch is the only trigger declared here. The optional
	# `refresh_all` flag is read by the gate step to force the heavy sources.
	on:
	  workflow_dispatch:
	    inputs:
	      refresh_all:
	        description: 'Bypass the twice-daily gate and regenerate Arena + Epoch this run'
	        type: boolean
	        default: false

	# The job commits and pushes regenerated data files, hence write access.
	permissions:
	  contents: write

	env:
	  # NOTE(review): presumably opts JavaScript-based actions into the Node 24
	  # runtime; confirm against the runner documentation.
	  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true

	jobs:
	  update-benchmarks:
	    runs-on: ubuntu-latest
	    steps:
	# Check out the repository so data/ can be regenerated and committed in place.
	- uses: actions/checkout@v4

	# Rust toolchain for the `cargo run ... --bin transform` invocations below.
	- name: Install Rust
	  uses: dtolnay/rust-toolchain@stable

	# Cache build artifacts between runs.
	- name: Cache Rust build
	  uses: Swatinem/rust-cache@v2

	# Arena + Epoch update daily upstream and Epoch's ZIP is multi-MB, so they
	# are gated to twice daily: the :17 run (minute < 30) at UTC hour 06 or 18.
	# The workflow_dispatch `refresh_all` boolean bypasses the gate so the
	# first post-merge dispatch can bootstrap all four files at once.
	#
	# Output: `do_heavy` ("true" / "false"), consumed by the Arena and Epoch steps.
	- name: Decide whether to refresh Arena + Epoch
	  id: gate
	  env:
	    # Handed over through the environment rather than interpolated into the
	    # script text, so the input value is never parsed as shell code.
	    REFRESH_ALL: ${{ github.event.inputs.refresh_all }}
	  run: |
	    # Sample the clock once: two separate `date` calls can straddle an hour
	    # boundary and pair the old hour with the new minute.
	    now=$(date -u +%H:%M)
	    hour=${now%%:*}
	    minute=${now##*:}
	    refresh_all="${REFRESH_ALL:-false}"
	    do_heavy=false
	    if [ "$refresh_all" = "true" ]; then
	      do_heavy=true
	      echo "refresh_all=true -> regenerating Arena + Epoch"
	    elif { [ "$hour" = "06" ] || [ "$hour" = "18" ]; } && [ "$minute" -lt 30 ]; then
	      do_heavy=true
	      echo "twice-daily gate open (UTC ${hour}:${minute}) -> regenerating Arena + Epoch"
	    else
	      echo "twice-daily gate closed (UTC ${hour}:${minute}) -> leaving Arena + Epoch files in place"
	    fi
	    echo "do_heavy=$do_heavy" >> "$GITHUB_OUTPUT"

	# --- AA: runs every trigger (legacy lane + v2) ------------------------
	# Downloads the raw model list to aa_raw.json (reused by the v2 transform
	# step) and projects it into the flat legacy schema in data/benchmarks.json.
	- name: Fetch benchmark data from Artificial Analysis API
	  id: aa
	  continue-on-error: true
	  env:
	    AA_API_KEY: ${{ secrets.AA_API_KEY }}
	  run: |
	    set -euo pipefail

	    curl -sf -H "X-API-Key: $AA_API_KEY" \
	      https://artificialanalysis.ai/api/v2/data/llms/models \
	      > aa_raw.json

	    # Build into a scratch file first. Redirecting straight onto the tracked
	    # file would truncate it before jq runs, so a jq failure would leave an
	    # empty data/benchmarks.json for the commit step to publish.
	    # Zero-valued speed/latency medians are mapped to null.
	    jq '[.data[] | {
	      id: .id,
	      name: .name,
	      slug: .slug,
	      creator: (.model_creator?.slug // null),
	      creator_id: (.model_creator?.id // null),
	      creator_name: (.model_creator?.name // null),
	      release_date: .release_date,
	      intelligence_index: (.evaluations?.artificial_analysis_intelligence_index // null),
	      coding_index: (.evaluations?.artificial_analysis_coding_index // null),
	      math_index: (.evaluations?.artificial_analysis_math_index // null),
	      mmlu_pro: (.evaluations?.mmlu_pro // null),
	      gpqa: (.evaluations?.gpqa // null),
	      hle: (.evaluations?.hle // null),
	      livecodebench: (.evaluations?.livecodebench // null),
	      scicode: (.evaluations?.scicode // null),
	      ifbench: (.evaluations?.ifbench // null),
	      lcr: (.evaluations?.lcr // null),
	      terminalbench_hard: (.evaluations?.terminalbench_hard // null),
	      tau2: (.evaluations?.tau2 // null),
	      math_500: (.evaluations?.math_500 // null),
	      aime: (.evaluations?.aime // null),
	      aime_25: (.evaluations?.aime_25 // null),
	      output_tps: (.median_output_tokens_per_second | if . == 0 then null else . end),
	      ttft: (.median_time_to_first_token_seconds | if . == 0 then null else . end),
	      ttfat: (.median_time_to_first_answer_token | if . == 0 then null else . end),
	      price_input: (.pricing?.price_1m_input_tokens // null),
	      price_output: (.pricing?.price_1m_output_tokens // null),
	      price_blended: (.pricing?.price_1m_blended_3_to_1 // null)
	    }]' aa_raw.json > aa_benchmarks.json

	    # An empty list would wipe the published data; treat it as a failed fetch
	    # and leave the existing file untouched.
	    count=$(jq length aa_benchmarks.json)
	    if [ "$count" -eq 0 ]; then
	      echo "AA API returned zero models; keeping existing data/benchmarks.json" >&2
	      exit 1
	    fi

	    mv aa_benchmarks.json data/benchmarks.json
	    echo "Fetched $count entries"

	# Runs only when the `aa` fetch step succeeded, i.e. aa_raw.json was written.
	- name: Generate v2 AA source file
	  if: steps.aa.outcome == 'success'
	  continue-on-error: true
	  run: |
	    mkdir -p data/v2
	    cargo run --features pipeline --bin transform -- \
	      aa aa_raw.json -o data/v2/aa.json

	# --- LLM Stats: runs every trigger (bounded paged fetch) --------------
	# `/v1/rankings` is hard-capped at limit=50 (limit>50 silently returns an
	# empty list); `/v1/models` is cursor-paginated. Both loops are bounded.
	- name: Fetch + generate v2 LLM Stats source file
	  id: llmstats
	  continue-on-error: true
	  env:
	    LLM_STATS_API_KEY: ${{ secrets.LLM_STATS_API_KEY }}
	  run: |
	    set -euo pipefail
	    base="https://api.llm-stats.com/stats/v1"
	    auth="Authorization: Bearer $LLM_STATS_API_KEY"
	    max_pages=10

	    # 11 curated categories -> one RankingsResponse each, assembled into
	    # { "rankings": [ ... ] }.
	    cats="agents code finance frontend_development general healthcare legal math multimodal reasoning vision"
	    : > rankings_parts.json
	    for cat in $cats; do
	      curl -sf -H "$auth" "$base/rankings?category=$cat&limit=50" >> rankings_parts.json
	      # Newline separator so `jq -s` sees one JSON document per response.
	      echo >> rankings_parts.json
	    done
	    jq -s '{ rankings: . }' rankings_parts.json > llmstats_rankings.json

	    # /v1/models, cursor-paginated, bounded to $max_pages pages -> { "models": [...] }.
	    : > models_parts.json
	    cursor=""
	    page=0
	    while [ "$page" -lt "$max_pages" ]; do
	      if [ -z "$cursor" ]; then
	        resp=$(curl -sf -H "$auth" "$base/models?limit=50")
	      else
	        # -G + --data-urlencode percent-encodes the opaque cursor, so characters
	        # such as '+', '=' or '&' cannot corrupt the query string.
	        resp=$(curl -sf -G -H "$auth" \
	          --data-urlencode "limit=50" \
	          --data-urlencode "cursor=$cursor" \
	          "$base/models")
	      fi
	      jq -c '.models[]?' <<<"$resp" >> models_parts.json
	      cursor=$(jq -r '.next_cursor // empty' <<<"$resp")
	      page=$((page + 1))
	      if [ -z "$cursor" ]; then
	        break
	      fi
	    done
	    # A cursor left over after the last page means the cap cut the list short;
	    # surface that instead of silently publishing a partial model set.
	    if [ -n "$cursor" ]; then
	      echo "::warning::LLM Stats /models still had a next_cursor after $max_pages pages; model list may be truncated"
	    fi
	    jq -s '{ models: . }' models_parts.json > llmstats_models.json

	    mkdir -p data/v2
	    cargo run --features pipeline --bin transform -- \
	      llmstats llmstats_rankings.json --models llmstats_models.json -o data/v2/llmstats.json

	# --- Arena: twice-daily gated ----------------------------------------
	# Resolves the newest snapshot, downloads each leaderboard board into
	# arena_boards/, then transforms that directory into data/v2/arena.json.
	- name: Fetch + generate v2 Arena source file
	  id: arena
	  if: steps.gate.outputs.do_heavy == 'true'
	  continue-on-error: true
	  run: |
	    set -euo pipefail
	    repo="https://raw.githubusercontent.com/oolong-tea-2026/arena-ai-leaderboards/main"

	    # latest.json points at the newest snapshot directory ({path, date}).
	    snapshot=$(curl -sf "$repo/data/latest.json" | jq -r '.path // .date // empty')
	    if [ -z "$snapshot" ]; then
	      echo "could not resolve Arena snapshot pointer" >&2
	      exit 1
	    fi
	    echo "Arena snapshot: $snapshot"

	    mkdir -p arena_boards
	    fetched=0
	    for board in text vision code agent search document; do
	      if curl -sf "$repo/data/$snapshot/$board.json" -o "arena_boards/$board.json"; then
	        fetched=$((fetched + 1))
	      else
	        # A single missing board is tolerated, but remove any partial file so
	        # the transform never reads the remains of a failed download.
	        rm -f "arena_boards/$board.json"
	        echo "warning: Arena board $board missing in snapshot $snapshot" >&2
	      fi
	    done

	    # With no boards at all there is nothing to transform; fail the step
	    # rather than regenerate arena.json from an empty directory.
	    if [ "$fetched" -eq 0 ]; then
	      echo "no Arena boards could be downloaded for snapshot $snapshot" >&2
	      exit 1
	    fi

	    mkdir -p data/v2
	    cargo run --features pipeline --bin transform -- arena arena_boards -o data/v2/arena.json

	# --- Epoch: twice-daily gated ----------------------------------------
	# Downloads the Epoch benchmark ZIP, unpacks it into a temp directory and
	# transforms the directory of CSVs into data/v2/epoch.json.
	- name: Fetch + generate v2 Epoch source file
	  id: epoch
	  if: steps.gate.outputs.do_heavy == 'true'
	  continue-on-error: true
	  run: |
	    set -euo pipefail
	    tmp=$(mktemp -d)
	    trap 'rm -rf "$tmp"' EXIT
	    curl -sf "https://epoch.ai/data/benchmark_data.zip" -o "$tmp/benchmark_data.zip"
	    unzip -q -o "$tmp/benchmark_data.zip" -d "$tmp/csv"

	    # The ZIP may unpack into a nested directory; transform reads a dir of
	    # CSVs, so locate the directory actually containing the .csv files.
	    csv_dir="$tmp/csv"
	    if [ -z "$(find "$csv_dir" -maxdepth 1 -name '*.csv' -print -quit)" ]; then
	      first_csv=$(find "$tmp/csv" -name '*.csv' -print -quit)
	      # `dirname ""` evaluates to ".", which would point the transform at the
	      # repository root; fail explicitly when the archive holds no CSV at all.
	      if [ -z "$first_csv" ]; then
	        echo "Epoch ZIP contained no .csv files" >&2
	        exit 1
	      fi
	      csv_dir=$(dirname "$first_csv")
	    fi

	    mkdir -p data/v2
	    cargo run --features pipeline --bin transform -- epoch "$csv_dir" -o data/v2/epoch.json

	# --- Commit + purge (failure-isolated: each file staged best-effort) ---
	# Output: `changed` ("true" when a commit was pushed, "false" when the
	# regenerated files are identical to what is already committed).
	- name: Commit and push if changed
	  id: commit
	  run: |
	    # Stage each path on its own: one `git add a b` aborts without staging
	    # anything when any single pathspec matches no files (e.g. data/v2 was
	    # never created), which would also drop the paths that did change.
	    for path in data/benchmarks.json data/v2; do
	      git add -- "$path" || echo "::warning::could not stage $path"
	    done

	    if git diff --cached --quiet; then
	      echo "changed=false" >> "$GITHUB_OUTPUT"
	      exit 0
	    fi

	    git config user.name "github-actions[bot]"
	    git config user.email "github-actions[bot]@users.noreply.github.com"
	    git commit -m "chore: update benchmark data"
	    git push
	    echo "changed=true" >> "$GITHUB_OUTPUT"

	- name: Purge jsDelivr cache
	  if: steps.commit.outputs.changed == 'true'
	  run: |
	    # jsDelivr aliased (@main) URLs can serve stale content "up to 7 days"
	    # without a successful purge, and its purge endpoint intermittently
	    # returns a 200 body of "no available server" (observed 2026-06-16/17,
	    # tail of the branch-resolution outage). curl can't auto-retry a 200,
	    # so retry with backoff on that body. Non-fatal: a failed purge just
	    # leaves the CDN stale until the next run re-purges the changed file.

	    # purge URL: request one purge URL, retrying up to max_attempts times.
	    # Always returns 0; a final failure is reported as a workflow warning.
	    purge() {
	      local url="$1"
	      local max_attempts=5
	      local attempt resp
	      for attempt in $(seq 1 "$max_attempts"); do
	        resp=$(curl -s --max-time 20 "$url" || true)
	        case "$resp" in
	          "" | *"no available server"*)
	            echo "purge attempt $attempt failed for $url: ${resp:-<empty>}"
	            # Back off only when another attempt follows; sleeping after the
	            # last one would just delay the warning.
	            if [ "$attempt" -lt "$max_attempts" ]; then
	              sleep $((attempt * 5))
	            fi
	            ;;
	          *)
	            echo "purged $url: $resp"
	            return 0
	            ;;
	        esac
	      done
	      echo "::warning::jsDelivr purge failed after retries for $url"
	      return 0
	    }

	    for f in \
	      data/benchmarks.json \
	      data/v2/aa.json \
	      data/v2/epoch.json \
	      data/v2/arena.json \
	      data/v2/llmstats.json; do
	      purge "https://purge.jsdelivr.net/gh/reyamira/models@main/$f"
	    done

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Update Benchmark Data #5494

Workflow file

Update Benchmark Data #5494

Uh oh!

Workflow file for this run