Skip to content

Update Benchmark Data #5418

Update Benchmark Data

Update Benchmark Data #5418

name: Update Benchmark Data

# Only a manual dispatch trigger is declared in this block. `refresh_all` is
# read by the gate step to force the Arena + Epoch regeneration.
# NOTE(review): the gate step's comment refers to a ":17 run", but no
# `schedule` trigger is visible here — confirm how periodic runs are started.
on:
  workflow_dispatch:
    inputs:
      refresh_all:
        description: "Bypass the twice-daily gate and regenerate Arena + Epoch this run"
        type: boolean
        default: false

# Write access to repository contents: the job pushes a commit with the
# regenerated data files.
permissions:
  contents: write

# Workflow-level environment, visible to every step.
# NOTE(review): presumably opts JavaScript-based actions into the Node 24
# runtime — confirm against the runner documentation.
env:
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: true
jobs:
  # The workflow's single job; the data-fetching, commit and purge steps all
  # belong to this job's `steps` list.
  update-benchmarks:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository so later steps can write into data/ and
      # commit the result.
      - uses: actions/checkout@v4

      # Stable Rust toolchain, needed by the `cargo run` transform steps.
      - name: Install Rust
        uses: dtolnay/rust-toolchain@stable

      # Cache Rust build outputs between runs.
      - name: Cache Rust build
        uses: Swatinem/rust-cache@v2
# Arena + Epoch update daily upstream and Epoch's ZIP is multi-MB, so they
# are gated to twice daily: the :17 run (minute < 30) at UTC hour 06 or 18.
# The workflow_dispatch `refresh_all` boolean bypasses the gate so the
# first post-merge dispatch can bootstrap all four files at once.
#
# Output: `do_heavy` ("true" / "false"), consumed by the Arena and Epoch steps.
- name: Decide whether to refresh Arena + Epoch
  id: gate
  env:
    # Hand the input to the script as an environment variable instead of
    # expanding the expression inside the script text, so the value can
    # never be interpreted as shell code.
    REFRESH_ALL: ${{ github.event.inputs.refresh_all }}
  run: |
    # Read the clock once so hour and minute describe the same instant.
    read -r hour minute <<< "$(date -u '+%H %M')"
    do_heavy=false
    if [ "${REFRESH_ALL:-false}" = "true" ]; then
      do_heavy=true
      echo "refresh_all=true -> regenerating Arena + Epoch"
    # `[ ... -lt ... ]` reads zero-padded minutes ("08", "09") as decimal;
    # do not convert this to $(( )) / (( )), which would parse them as octal.
    elif { [ "$hour" = "06" ] || [ "$hour" = "18" ]; } && [ "$minute" -lt 30 ]; then
      do_heavy=true
      echo "twice-daily gate open (UTC ${hour}:${minute}) -> regenerating Arena + Epoch"
    else
      echo "twice-daily gate closed (UTC ${hour}:${minute}) -> leaving Arena + Epoch files in place"
    fi
    echo "do_heavy=$do_heavy" >> "$GITHUB_OUTPUT"
# --- AA: runs every trigger (legacy lane + v2) ------------------------
# Downloads the raw Artificial Analysis model list to aa_raw.json (reused by
# the v2 transform step) and flattens it into the legacy data/benchmarks.json.
- name: Fetch benchmark data from Artificial Analysis API
  id: aa
  continue-on-error: true
  env:
    AA_API_KEY: ${{ secrets.AA_API_KEY }}
  run: |
    set -euo pipefail
    curl -sf -H "X-API-Key: $AA_API_KEY" \
      https://artificialanalysis.ai/api/v2/data/llms/models \
      > aa_raw.json

    # Build the output in a scratch file and move it into place only after
    # jq succeeded. Redirecting straight into data/benchmarks.json would
    # truncate the published file before jq runs; because this step is
    # continue-on-error, a jq failure would then let the commit step publish
    # an empty file.
    out=data/benchmarks.json
    tmp_out=$(mktemp "$out.XXXXXX")
    trap 'rm -f "$tmp_out"' EXIT

    # Speed/latency fields report 0 when unmeasured; map those to null.
    jq '[.data[] | {
      id: .id,
      name: .name,
      slug: .slug,
      creator: (.model_creator?.slug // null),
      creator_id: (.model_creator?.id // null),
      creator_name: (.model_creator?.name // null),
      release_date: .release_date,
      intelligence_index: (.evaluations?.artificial_analysis_intelligence_index // null),
      coding_index: (.evaluations?.artificial_analysis_coding_index // null),
      math_index: (.evaluations?.artificial_analysis_math_index // null),
      mmlu_pro: (.evaluations?.mmlu_pro // null),
      gpqa: (.evaluations?.gpqa // null),
      hle: (.evaluations?.hle // null),
      livecodebench: (.evaluations?.livecodebench // null),
      scicode: (.evaluations?.scicode // null),
      ifbench: (.evaluations?.ifbench // null),
      lcr: (.evaluations?.lcr // null),
      terminalbench_hard: (.evaluations?.terminalbench_hard // null),
      tau2: (.evaluations?.tau2 // null),
      math_500: (.evaluations?.math_500 // null),
      aime: (.evaluations?.aime // null),
      aime_25: (.evaluations?.aime_25 // null),
      output_tps: (.median_output_tokens_per_second | if . == 0 then null else . end),
      ttft: (.median_time_to_first_token_seconds | if . == 0 then null else . end),
      ttfat: (.median_time_to_first_answer_token | if . == 0 then null else . end),
      price_input: (.pricing?.price_1m_input_tokens // null),
      price_output: (.pricing?.price_1m_output_tokens // null),
      price_blended: (.pricing?.price_1m_blended_3_to_1 // null)
    }]' aa_raw.json > "$tmp_out"

    mv "$tmp_out" "$out"
    echo "Fetched $(jq length data/benchmarks.json) entries"
# Turns the raw payload saved by the `aa` step (aa_raw.json) into the v2
# source file. Runs only when that step succeeded; marked continue-on-error
# so a transform failure does not stop the remaining steps.
- name: Generate v2 AA source file
  if: steps.aa.outcome == 'success'
  continue-on-error: true
  run: |
    mkdir -p data/v2
    cargo run --features pipeline --bin transform -- \
      aa aa_raw.json -o data/v2/aa.json
# --- LLM Stats: runs every trigger (bounded paged fetch) --------------
# `/v1/rankings` is hard-capped at limit=50 (limit>50 silently returns an
# empty list); `/v1/models` is cursor-paginated. Both loops are bounded.
- name: Fetch + generate v2 LLM Stats source file
  id: llmstats
  continue-on-error: true
  env:
    LLM_STATS_API_KEY: ${{ secrets.LLM_STATS_API_KEY }}
  run: |
    set -euo pipefail
    base="https://api.llm-stats.com/stats/v1"
    auth="Authorization: Bearer $LLM_STATS_API_KEY"
    max_pages=10

    # 11 curated categories -> one RankingsResponse each, assembled into
    # { "rankings": [ ... ] }.
    cats="agents code finance frontend_development general healthcare legal math multimodal reasoning vision"
    : > rankings_parts.json
    for cat in $cats; do
      curl -sf -H "$auth" "$base/rankings?category=$cat&limit=50" >> rankings_parts.json
      # Newline separator so jq -s sees one document per response.
      echo >> rankings_parts.json
    done
    jq -s '{ rankings: . }' rankings_parts.json > llmstats_rankings.json

    # /v1/models, cursor-paginated, bounded to at most $max_pages pages
    # -> { "models": [...] }.
    : > models_parts.json
    cursor=""
    page=0
    while [ "$page" -lt "$max_pages" ]; do
      # -G sends the --data-urlencode pairs as a query string, so a cursor
      # containing reserved characters (+, /, =, &) reaches the API intact
      # instead of being mangled by plain string interpolation into the URL.
      query=(--data-urlencode "limit=50")
      if [ -n "$cursor" ]; then
        query+=(--data-urlencode "cursor=$cursor")
      fi
      resp=$(curl -sf -G -H "$auth" "${query[@]}" "$base/models")
      jq -c '.models[]?' <<< "$resp" >> models_parts.json
      cursor=$(jq -r '.next_cursor // empty' <<< "$resp")
      page=$((page + 1))
      if [ -z "$cursor" ]; then
        break
      fi
    done
    # A cursor left over after the last allowed page means more models exist
    # upstream than were fetched; surface that instead of truncating silently.
    if [ -n "$cursor" ]; then
      echo "::warning::LLM Stats /models still returned a next_cursor after $max_pages pages; the model list is truncated"
    fi
    jq -s '{ models: . }' models_parts.json > llmstats_models.json

    mkdir -p data/v2
    cargo run --features pipeline --bin transform -- \
      llmstats llmstats_rankings.json --models llmstats_models.json -o data/v2/llmstats.json
# --- Arena: twice-daily gated ----------------------------------------
# Resolves the newest snapshot, downloads each leaderboard board into
# arena_boards/, and transforms that directory into data/v2/arena.json.
- name: Fetch + generate v2 Arena source file
  id: arena
  if: steps.gate.outputs.do_heavy == 'true'
  continue-on-error: true
  run: |
    set -euo pipefail
    repo="https://raw.githubusercontent.com/oolong-tea-2026/arena-ai-leaderboards/main"
    # latest.json points at the newest snapshot directory ({path, date}).
    snapshot=$(curl -sf "$repo/data/latest.json" | jq -r '.path // .date // empty')
    if [ -z "$snapshot" ]; then
      echo "could not resolve Arena snapshot pointer" >&2
      exit 1
    fi
    echo "Arena snapshot: $snapshot"

    mkdir -p arena_boards
    fetched=0
    for board in text vision code agent search document; do
      dest="arena_boards/$board.json"
      if curl -sf "$repo/data/$snapshot/$board.json" -o "$dest"; then
        fetched=$((fetched + 1))
      else
        # An individual missing board is tolerated, but remove anything an
        # interrupted transfer may have left so the transform never reads a
        # truncated JSON file.
        rm -f "$dest"
        echo "warning: Arena board $board missing in snapshot $snapshot" >&2
      fi
    done
    # With no board at all there is nothing to transform; stop here rather
    # than regenerate arena.json from an empty directory.
    if [ "$fetched" -eq 0 ]; then
      echo "no Arena boards could be downloaded for snapshot $snapshot" >&2
      exit 1
    fi

    mkdir -p data/v2
    cargo run --features pipeline --bin transform -- arena arena_boards -o data/v2/arena.json
# --- Epoch: twice-daily gated ----------------------------------------
# Downloads Epoch's benchmark ZIP into a scratch directory, locates the
# directory holding its CSV files, and transforms it into data/v2/epoch.json.
- name: Fetch + generate v2 Epoch source file
  id: epoch
  if: steps.gate.outputs.do_heavy == 'true'
  continue-on-error: true
  run: |
    set -euo pipefail
    tmp=$(mktemp -d)
    # Remove the scratch directory however the step ends.
    trap 'rm -rf "$tmp"' EXIT
    curl -sf "https://epoch.ai/data/benchmark_data.zip" -o "$tmp/benchmark_data.zip"
    unzip -q -o "$tmp/benchmark_data.zip" -d "$tmp/csv"

    # The ZIP may unpack into a nested directory; transform reads a dir of
    # CSVs, so locate the directory actually containing the .csv files.
    csv_dir="$tmp/csv"
    if [ -z "$(find "$csv_dir" -maxdepth 1 -name '*.csv' -print -quit)" ]; then
      nested_csv=$(find "$csv_dir" -name '*.csv' -print -quit)
      # Without this guard an archive with no CSV would make `dirname ""`
      # return "." and point the transform at the repository root.
      if [ -z "$nested_csv" ]; then
        echo "no .csv files found in the Epoch archive" >&2
        exit 1
      fi
      csv_dir=$(dirname "$nested_csv")
    fi

    mkdir -p data/v2
    cargo run --features pipeline --bin transform -- epoch "$csv_dir" -o data/v2/epoch.json
# --- Commit + purge (failure-isolated: each file staged best-effort) ---
# Output: `changed` ("true" when a commit was pushed, "false" otherwise),
# consumed by the purge step.
- name: Commit and push if changed
  id: commit
  run: |
    # Stage each path separately. A single `git add` given several pathspecs
    # stages nothing at all when any one of them fails to match (for example
    # data/v2/ not existing because every v2 step failed), which would also
    # drop a valid data/benchmarks.json update.
    for path in data/benchmarks.json data/v2/; do
      git add -- "$path" || echo "::warning::could not stage $path"
    done

    # Nothing staged -> report and stop without creating an empty commit.
    if git diff --cached --quiet; then
      echo "changed=false" >> "$GITHUB_OUTPUT"
      exit 0
    fi

    git config user.name "github-actions[bot]"
    # GitHub's noreply address domain for the Actions bot identity.
    git config user.email "github-actions[bot]@users.noreply.github.com"
    git commit -m "chore: update benchmark data"
    git push
    echo "changed=true" >> "$GITHUB_OUTPUT"
- name: Purge jsDelivr cache
  if: steps.commit.outputs.changed == 'true'
  run: |
    # jsDelivr aliased (@main) URLs can serve stale content "up to 7 days"
    # without a successful purge, and its purge endpoint intermittently
    # returns a 200 body of "no available server" (observed 2026-06-16/17,
    # tail of the branch-resolution outage). curl can't auto-retry a 200,
    # so retry with backoff on that body. Non-fatal: a failed purge just
    # leaves the CDN stale until the next run re-purges the changed file.

    # purge URL: request the purge endpoint up to $max_attempts times,
    # treating an empty body or "no available server" as a failed attempt.
    # Always returns 0 so a purge problem never fails the job.
    purge() {
      local url="$1"
      local max_attempts=5
      local attempt resp
      for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        resp=$(curl -s --max-time 20 "$url" || true)
        case "$resp" in
          *"no available server"*|"")
            echo "purge attempt $attempt failed for $url: ${resp:-<empty>}"
            # Linear back-off (5s, 10s, ...), skipped after the final
            # attempt since no retry follows it.
            if [ "$attempt" -lt "$max_attempts" ]; then
              sleep $((attempt * 5))
            fi
            ;;
          *)
            echo "purged $url: $resp"
            return 0
            ;;
        esac
      done
      echo "::warning::jsDelivr purge failed after retries for $url"
      return 0
    }

    for f in \
      data/benchmarks.json \
      data/v2/aa.json \
      data/v2/epoch.json \
      data/v2/arena.json \
      data/v2/llmstats.json; do
      purge "https://purge.jsdelivr.net/gh/reyamira/models@main/$f"
    done