# Workflow file for CI run #1473:
# "perf(decode): straight-loop short path + donor-gated lookahead ring + SeqSymbol repack"

name: CI

# Runs for pushes to `main` and for pull requests targeting `main`.
# Changes confined to Markdown, docs, the bench dashboard, the license,
# or `.gitignore` are excluded on both triggers: they touch no code,
# tests, or bench inputs, so the full lint → test → cross-i686 → msrv →
# codecov → fuzz → bench pipeline would be wasted on them. Republishing
# the dashboard's `index.html` is handled by `pages-only.yml`, which
# filters on the inverse of these paths.
on:
  push:
    branches:
      - main
    paths-ignore:
      - '**.md'
      - '.github/bench-dashboard/**'
      - 'docs/**'
      - 'LICENSE*'
      - '.gitignore'
  pull_request:
    branches:
      - main
    paths-ignore:
      - '**.md'
      - '.github/bench-dashboard/**'
      - 'docs/**'
      - 'LICENSE*'
      - '.gitignore'

permissions:
  contents: read

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  # A newer run cancels an in-flight one, except for pushes to `main`:
  # the benchmark baseline save and the gh-pages merge must run to
  # completion together.
  cancel-in-progress: ${{ !(github.event_name == 'push' && github.ref == 'refs/heads/main') }}

env:
  CARGO_TERM_COLOR: always
jobs:
lint:
  # Pre-flight checks: formatting, clippy, and two repository-policy
  # gates. Downstream jobs reference this job through `needs: lint`.
  timeout-minutes: 10
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@stable
      with:
        components: rustfmt, clippy
    - uses: Swatinem/rust-cache@v2
    - name: Format
      run: cargo fmt --all -- --check
    - name: Clippy
      run: cargo clippy -p structured-zstd --features hash,std,dict_builder -- -D warnings
    - name: Clippy (bench_internals)
      run: cargo clippy -p structured-zstd --features hash,std,dict_builder,bench_internals --benches -- -D warnings
    - name: Gate — compare_ffi/memory must not pull bench_internals
      # Rationale: both `compare_ffi` (timing) and `compare_ffi_memory`
      # (peak alloc) benchmark structured-zstd against libzstd. If
      # `bench_internals` widens the crate's public API surface
      # (visibility changes for `BitReaderReversed` etc.), the Rust side
      # is compiled differently from what a real downstream consumer
      # uses, biasing every cross-side comparison. Keep bench_internals
      # strictly out of both benches' feature sets.
      #
      # tomllib (Python 3.11+ stdlib) parses single-line and multi-line
      # TOML arrays alike, so reformatting a `required-features` array
      # cannot sneak `bench_internals` past the gate.
      run: |
        python3 - <<'PY'
        import sys, tomllib
        with open("zstd/Cargo.toml", "rb") as f:
            cargo = tomllib.load(f)
        benches = {
            bench.get("name"): bench.get("required-features", [])
            for bench in cargo.get("bench", [])
        }
        violators = [
            name for name in ("compare_ffi", "compare_ffi_memory")
            if "bench_internals" in benches.get(name, [])
        ]
        if violators:
            for name in violators:
                print(f"::error::{name} must NOT require bench_internals — would bias Rust-vs-FFI parity")
            sys.exit(1)
        PY
    - name: Gate — bench instrumentation must not leak into zstd/src/
      # Rationale: bench-only memory observation (TrackingAllocator,
      # customMem hooks) lives in zstd/benches/. Anything
      # bench-instrumentation-shaped in zstd/src/ would bloat the
      # published crate. Comments are OK; identifier references in
      # actual code are not.
      #
      # The second `rg -v` pass drops comment-only lines so a doc comment
      # that mentions these names does not trip the gate. It matches
      # `path:line:` followed by optional whitespace and a Rust comment
      # prefix (`//`, `///`, `//!`, `/*`, ` *`).
      #
      # The gate fails closed: rg exits 1 when nothing matches and with
      # a higher status on a real error (bad pattern, unreadable path,
      # or 127 from the shell when rg is not installed). Only status 1
      # is accepted as "no hits"; a blanket `|| true` would let a broken
      # scan pass as clean.
      run: |
        pattern='TrackingAllocator|ALLOC_PEAK|ALLOC_CURRENT|TRACKING_ENABLED|ZSTD_customMem|customMem\('
        comment_only='^[^:]+:[0-9]+:\s*(//|/\*|\*)'
        rc=0
        hits=$(rg -n --no-heading "$pattern" zstd/src/) || rc=$?
        if [ "$rc" -gt 1 ]; then
          echo "::error::rg failed (exit $rc) while scanning zstd/src/ — refusing to pass the gate"
          exit 1
        fi
        leaked=""
        if [ -n "$hits" ]; then
          rc=0
          leaked=$(printf '%s\n' "$hits" | rg -v "$comment_only") || rc=$?
          if [ "$rc" -gt 1 ]; then
            echo "::error::rg failed (exit $rc) while filtering comment-only matches"
            exit 1
          fi
        fi
        if [ -n "$leaked" ]; then
          echo "$leaked"
          echo "::error::bench-only instrumentation symbols leaked into zstd/src/"
          exit 1
        fi
test:
  # Unit/integration tests (nextest) plus doc tests on all three OSes.
  needs: lint
  timeout-minutes: 15
  strategy:
    # Each OS hits its own runner-image regressions (macos-latest currently
    # ships Homebrew rustup without `cargo`/`rustc` shims under
    # `~/.cargo/bin`, so plain `cargo …` resolves to `rustup-init`). Run
    # every OS so one glitchy image doesn't mask the others.
    fail-fast: false
    matrix:
      os: [ubuntu-latest, windows-latest, macos-latest]
  runs-on: ${{ matrix.os }}
  steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@stable
    - name: Prepend toolchain bin to PATH (macos shim workaround)
      if: runner.os == 'macOS'
      # macos-latest preinstalled rustup ships via Homebrew without the
      # `~/.cargo/bin/{cargo,rustc}` proxy shims. `rustup run stable cargo`
      # works for the outer call but the cargo it launches then invokes
      # `rustc -vV` through the same broken proxy. Putting the toolchain's
      # actual `bin/` ahead on PATH gives every nested invocation the
      # real binaries.
      #
      # The toolchain root honours `RUSTUP_HOME` and falls back to
      # `$HOME/.rustup`; `$GITHUB_PATH` is quoted so a path containing
      # whitespace cannot split the redirect target.
      run: |
        TC="$(rustup show active-toolchain | awk '{print $1}')"
        echo "${RUSTUP_HOME:-$HOME/.rustup}/toolchains/$TC/bin" >> "$GITHUB_PATH"
    - uses: taiki-e/install-action@nextest
    - uses: Swatinem/rust-cache@v2
      with:
        prefix-key: ${{ runner.os }}-cargo
    - name: Test
      working-directory: zstd
      run: cargo nextest run --profile ci -p structured-zstd --features hash,std,dict_builder
    - name: Doc tests
      run: cargo test --doc -p structured-zstd --features hash,std,dict_builder
cross-i686:
  # Runs the nextest suite built for the 32-bit
  # `i686-unknown-linux-gnu` target on `ubuntu-latest`, after lint.
  runs-on: ubuntu-latest
  needs: lint
  timeout-minutes: 15
  steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@stable
    - uses: taiki-e/install-action@nextest
    - name: Install i686 target
      run: rustup target add i686-unknown-linux-gnu
    - name: Install 32-bit libs
      run: sudo apt-get update && sudo apt-get install -y gcc-multilib
    - uses: Swatinem/rust-cache@v2
      with:
        prefix-key: cross-i686
    - name: Test (i686)
      working-directory: zstd
      # Folded scalar: the lines below join into one command line.
      run: >-
        cargo nextest run --profile ci -p structured-zstd
        --features hash,std,dict_builder
        --target i686-unknown-linux-gnu
msrv:
  # Runs the nextest suite with the toolchain pinned to 1.92.0 instead
  # of whatever `stable` currently resolves to.
  runs-on: ubuntu-latest
  needs: lint
  timeout-minutes: 15
  steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@stable
      with:
        # Quoted so the version stays a string.
        toolchain: '1.92.0'
    - uses: taiki-e/install-action@nextest
    - uses: Swatinem/rust-cache@v2
      with:
        prefix-key: msrv
    - name: Test (MSRV)
      working-directory: zstd
      run: >-
        cargo nextest run --profile ci -p structured-zstd
        --features hash,std,dict_builder
no-std:
  # Build-only smoke test: the crate must compile under no_std + alloc.
  # The decoder ships `#![no_std]`, with the optional `std` feature
  # gating the io::Read/Write impls and runtime CPUID detection. Without
  # this job, a stray `use std::...` in a hot decode path would silently
  # break downstream embedded users.
  #
  # Two configurations are checked: alloc-only (no features) and
  # alloc + xxhash content checksum (`hash`). Clippy already performs
  # cargo's compile checks, so no separate `cargo check` pass is run.
  runs-on: ubuntu-latest
  needs: lint
  timeout-minutes: 10
  steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@stable
      with:
        components: clippy
    - uses: Swatinem/rust-cache@v2
      with:
        prefix-key: no-std
    - name: Clippy (no_std)
      working-directory: zstd
      run: >-
        cargo clippy -p structured-zstd --no-default-features
        -- -D warnings
    - name: Clippy (no_std + hash)
      working-directory: zstd
      run: >-
        cargo clippy -p structured-zstd --no-default-features
        --features hash
        -- -D warnings
codecov:
  # Produces an lcov report with cargo-llvm-cov and uploads it to
  # Codecov.
  runs-on: ubuntu-latest
  needs: lint
  timeout-minutes: 15
  steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@nightly
      with:
        components: llvm-tools-preview
    - uses: Swatinem/rust-cache@v2
    - uses: taiki-e/install-action@cargo-llvm-cov
    # Runs inside `zstd/`, so the report lands at `zstd/lcov.info`,
    # which is the path handed to the upload step below.
    - working-directory: zstd
      run: >-
        cargo llvm-cov -p structured-zstd --features hash,std,dict_builder
        --lcov --output-path lcov.info
    - uses: codecov/codecov-action@v6
      env:
        CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
      with:
        files: zstd/lcov.info
fuzz:
  # Short-budget libFuzzer smoke run over every target on one runner.
  # The asan+sancov sub-build of `structured-zstd` dominates the cost
  # (~2 min). Running the five targets back to back in a single job
  # amortises that build (cargo cache reuse drops targets 2..5 to a few
  # seconds each) and halves the compute versus a per-target matrix.
  # Wall-clock stays bounded by the bench matrix, which runs in parallel
  # and takes far longer.
  #
  # The repo ships a regression corpus in `zstd/fuzz/artifacts/`. For
  # each target that corpus is replayed first (an old crash that
  # resurfaces fails the job), then `-max_total_time` seconds of fresh
  # fuzzing run on top.
  name: Fuzz smoke
  runs-on: ubuntu-latest
  needs: lint
  timeout-minutes: 20
  env:
    # `rust-toolchain.toml` pins stable, but `cargo fuzz` requires
    # `-Z sanitizer` from nightly; this override gives the fuzz crate
    # sub-build a nightly compiler.
    RUSTUP_TOOLCHAIN: nightly
    # Explicit target: the prebuilt `cargo-fuzz` binary from
    # taiki-e/install-action is statically linked against musl, and its
    # `default_target()` probe picks `x86_64-unknown-linux-musl`. That
    # fails because libFuzzer's AddressSanitizer cannot link against a
    # static libc (`sanitizer is incompatible with statically linked
    # libc`). Pinning the gnu target sidesteps the probe and matches
    # the toolchain rustc actually has stdlib for.
    FUZZ_TARGET_TRIPLE: x86_64-unknown-linux-gnu
    # Single source of truth for the fuzz target inventory: both the
    # replay step and the fresh-fuzz step iterate over it.
    FUZZ_TARGETS: 'decode encode interop huff0 fse'
  steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@nightly
    - uses: Swatinem/rust-cache@v2
      with:
        prefix-key: fuzz
        workspaces: zstd/fuzz
    - name: Install cargo-fuzz
      uses: taiki-e/install-action@v2
      with:
        tool: cargo-fuzz
    - name: Replay regression corpus
      # Feeds the stored crash artifacts back through each target, so a
      # reintroduced bug fails CI on the very input that first exposed
      # it. Targets with no artifact directory, or an empty one, are
      # skipped.
      working-directory: zstd/fuzz
      run: |
        for target in $FUZZ_TARGETS; do
          if [ -d "artifacts/$target" ] && [ -n "$(ls artifacts/$target 2>/dev/null)" ]; then
            echo "Replaying $(ls artifacts/$target | wc -l) regression artifacts for $target"
            cargo fuzz run --target "$FUZZ_TARGET_TRIPLE" "$target" artifacts/"$target"/*
          else
            echo "No regression artifacts for $target — skipping replay"
          fi
        done
    - name: Fuzz (90s per target)
      working-directory: zstd/fuzz
      # `-max_total_time` is libFuzzer's own time cap. Together with the
      # job's `timeout-minutes` it gives a hard ceiling even if a target
      # wedges in setup.
      run: |
        for target in $FUZZ_TARGETS; do
          echo "::group::Fuzz $target"
          cargo fuzz run --target "$FUZZ_TARGET_TRIPLE" "$target" -- -max_total_time=90 -timeout=30
          echo "::endgroup::"
        done
    - name: Upload new crash artifacts on failure
      if: failure()
      uses: actions/upload-artifact@v7
      with:
        name: fuzz-artifacts
        path: zstd/fuzz/artifacts/
        retention-days: 14
        if-no-files-found: ignore
bench-matrix:
  # Canonical bench target inventory. Every downstream bench job
  # (`bench-build`, `benchmark`, `benchmark-aggregate`,
  # `benchmark-pages`) consumes this so a new target ID is added in
  # exactly one place. `build_setup` runs only on `bench-build`;
  # `runtime_setup` runs only on the bench shards (e.g. `i686-gnu`
  # installs `libc6-dev-i386` on the shard runner, while `x86_64-musl`
  # only needs `musl-tools` for the build).
  #
  # Release-plz PRs are version-bump-only (no source changes), so the
  # entire bench pipeline (matrix → build → shards → aggregate →
  # pages → regression) is gated out for them via the `if:` filter.
  # The skip cascades through `needs:`, so downstream bench jobs also
  # stay green-skipped on those PRs. See #164.
  name: Resolve bench target matrix
  needs: lint
  if: |
    github.event_name != 'pull_request' ||
    (github.event.pull_request.user.login != 'release-plz[bot]' &&
    !startsWith(github.head_ref, 'release-plz-'))
  # This job only writes two small JSON documents and runs jq. The
  # explicit cap keeps it consistent with the other jobs instead of
  # falling back to the platform default timeout.
  timeout-minutes: 5
  runs-on: ubuntu-latest
  outputs:
    targets: ${{ steps.set.outputs.targets }}
    ids_csv: ${{ steps.set.outputs.ids_csv }}
    shards: ${{ steps.set.outputs.shards }}
    shards_csv: ${{ steps.set.outputs.shards_csv }}
  steps:
    - id: set
      env:
        # Drives the shard plan below. On a `pull_request` only the two
        # canonical levels run (`level_3_dfast` = donor default,
        # `level_22_btultra2` = max compression), bundled into a single
        # shard per target — three shards total, cheap PR feedback. On
        # a `push: main` (post-merge), one shard per strategy group
        # runs — nine groups (fast split into `fast-neg` / `fast-pos`;
        # lazy split into `lazy-lower` / `lazy-upper` to keep the
        # worst-case per-shard wall under the 120-min CI cap) × three
        # targets = 27 shards, so the published gh-pages snapshot keeps
        # full coverage for the dashboard + tagged baselines (#164).
        EVENT_NAME: ${{ github.event_name }}
      # Heredoc terminators (`EOF`) must stay at the script's left
      # margin even inside the `if` branches; the JSON bodies may be
      # laid out freely because `jq -c` re-serialises them.
      run: |
        cat > targets.json <<'EOF'
        [
          {
            "id": "x86_64-gnu",
            "target_triple": "x86_64-unknown-linux-gnu",
            "build_setup": "",
            "runtime_setup": "",
            "timeout_minutes": 120
          },
          {
            "id": "i686-gnu",
            "target_triple": "i686-unknown-linux-gnu",
            "build_setup": "sudo apt-get update && sudo apt-get install -y gcc-multilib libc6-dev-i386",
            "runtime_setup": "sudo apt-get update && sudo apt-get install -y libc6-dev-i386",
            "timeout_minutes": 120
          },
          {
            "id": "x86_64-musl",
            "target_triple": "x86_64-unknown-linux-musl",
            "build_setup": "sudo apt-get update && sudo apt-get install -y musl-tools",
            "runtime_setup": "",
            "timeout_minutes": 120
          }
        ]
        EOF
        targets_compact=$(jq -c . targets.json)
        ids_csv=$(jq -r '[.[].id] | join(",")' targets.json)
        echo "targets=$targets_compact" >> "$GITHUB_OUTPUT"
        echo "ids_csv=$ids_csv" >> "$GITHUB_OUTPUT"
        echo "Bench targets: $ids_csv"
        # Shard plan: each entry runs a comma-separated set of levels
        # through one bench binary invocation via the
        # `STRUCTURED_ZSTD_BENCH_LEVEL_FILTER` env var. `id` drives the
        # artifact name (`benchmark-shard-<target>-<id>`) and the
        # per-file suffix in the markdown / JSON outputs.
        #
        # PR event = single shard covering the two canonical levels so
        # reviewers see ratio + speed + memory deltas on the
        # default-level path (level_3_dfast) and the max-compression
        # path (level_22_btultra2) within minutes. Strategy groups
        # mirror `clevels.h` + `StrategyTag::for_level` so an entire
        # strategy's levels share a runner — keeps per-job overhead
        # amortised across the levels of that family.
        if [ "$EVENT_NAME" = "pull_request" ]; then
          cat > shards.json <<'EOF'
        [
          {"id": "pr-canonical", "levels": "level_3_dfast,level_22_btultra2"}
        ]
        EOF
        else
          cat > shards.json <<'EOF'
        [
          {"id": "fast-neg", "levels": "level_-7_fast,level_-6_fast,level_-5_fast,level_-4_fast,level_-3_fast"},
          {"id": "fast-pos", "levels": "level_-2_fast,level_-1_fast,level_1_fast"},
          {"id": "dfast", "levels": "level_2_dfast,level_3_dfast"},
          {"id": "greedy", "levels": "level_4_greedy"},
          {"id": "lazy-lower", "levels": "level_5_lazy,level_6_lazy,level_7_lazy,level_8_lazy,level_9_lazy"},
          {"id": "lazy-upper", "levels": "level_10_lazy,level_11_lazy,level_12_lazy,level_13_lazy,level_14_lazy,level_15_lazy"},
          {"id": "btopt", "levels": "level_16_btopt,level_17_btopt"},
          {"id": "btultra", "levels": "level_18_btultra,level_19_btultra"},
          {"id": "btultra2", "levels": "level_20_btultra2,level_21_btultra2,level_22_btultra2"}
        ]
        EOF
        fi
        shards_compact=$(jq -c . shards.json)
        shards_csv=$(jq -r '[.[].id] | join(",")' shards.json)
        echo "shards=$shards_compact" >> "$GITHUB_OUTPUT"
        echo "shards_csv=$shards_csv" >> "$GITHUB_OUTPUT"
        echo "Bench shards ($EVENT_NAME): $shards_csv"
bench-build:
  # Builds the criterion `compare_ffi` and `compare_ffi_memory` binaries
  # once per target. Every downstream bench shard downloads them via the
  # `bench-binary-<target>` artifact and runs them directly — no rebuild
  # per shard, which saves roughly 4-7 min on each shard runner (three
  # shards on a PR, 27 on a push to main).
  name: Build bench binary (${{ matrix.bench.id }})
  needs: [lint, bench-matrix]
  timeout-minutes: 20
  strategy:
    fail-fast: false
    matrix:
      bench: ${{ fromJSON(needs.bench-matrix.outputs.targets) }}
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v6
    - uses: dtolnay/rust-toolchain@stable
    - name: Install benchmark target
      run: rustup target add ${{ matrix.bench.target_triple }}
    - name: Install build toolchain dependencies
      if: matrix.bench.build_setup != ''
      run: ${{ matrix.bench.build_setup }}
    - uses: Swatinem/rust-cache@v2
      with:
        prefix-key: bench-${{ matrix.bench.id }}
    - name: Build compare_ffi + compare_ffi_memory bench binaries
      env:
        CC_x86_64_unknown_linux_musl: musl-gcc
        # The donor zstd-sys C library uses runtime feature detection
        # (is_x86_feature_detected!-equivalent), so it transparently
        # picks up BMI2/AVX2/etc. on the runner. Pure-Rust hot paths
        # gate intrinsics on COMPILE-time cfg!(target_feature = ...),
        # and the default rustc x86_64 target ships with SSE2 only.
        # Without explicit target selection the bench compares "donor
        # with full ISA" vs "us with SSE2 baseline" — not
        # apples-to-apples.
        #
        # Use a DETERMINISTIC baseline (x86-64-v3 = BMI2 + AVX2 +
        # everything in the Haswell ISA) for the two x86_64 targets
        # (gnu and musl) via per-target RUSTFLAGS. NOT
        # target-cpu=native: that picks whatever CPU the BUILD runner
        # has, which (a) varies across github-runners, (b) crashes with
        # SIGILL when a bench shard runner lacks features the build
        # runner had, and (c) is meaningless for cross-compile targets
        # like i686-unknown-linux-gnu.
        #
        # Targets without an entry below (i686) keep the default rustc
        # baseline. Measured +8.5% on
        # decompress/level_-1_fast/decodecorpus-z000033/c_stream on
        # i9-9900K; the win comes from _bzhi_u64-backed
        # mask_lower_bits in the FSE state-update hot path.
        CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: "-C target-cpu=x86-64-v3"
        CARGO_TARGET_X86_64_UNKNOWN_LINUX_MUSL_RUSTFLAGS: "-C target-cpu=x86-64-v3"
      run: |
        # `--no-run` builds without executing; `--message-format=json`
        # exposes the resolved binary path in the build log so only the
        # executables are shipped to shards. Building BOTH bench
        # binaries in one cargo invocation reuses dependency
        # compilation between them (zstd-sys, criterion, etc.).
        cargo bench --bench compare_ffi --bench compare_ffi_memory \
          -p structured-zstd --features dict_builder \
          --target ${{ matrix.bench.target_triple }} --no-run \
          --message-format=json > build.log
        mkdir -p bench-binary
        for name in compare_ffi compare_ffi_memory; do
          # Last matching record wins if cargo reports the target more
          # than once.
          bin_path=$(jq -r --arg n "$name" 'select(.executable != null and (.target.name == $n)) | .executable' build.log | tail -1)
          if [ -z "$bin_path" ] || [ ! -x "$bin_path" ]; then
            echo "ERROR: failed to locate $name binary in cargo output" >&2
            # Dump every executable cargo did report, to aid debugging.
            jq -r 'select(.executable != null) | "\(.target.name) \(.executable)"' build.log >&2
            exit 1
          fi
          cp "$bin_path" "bench-binary/$name"
          chmod +x "bench-binary/$name"
          echo "$name size: $(wc -c < "bench-binary/$name") bytes"
        done
    - name: Upload bench binary
      uses: actions/upload-artifact@v7
      with:
        name: bench-binary-${{ matrix.bench.id }}
        path: bench-binary/
        if-no-files-found: error
        retention-days: 7
benchmark:
  name: Bench ${{ matrix.bench.id }} / ${{ matrix.shard.id }}
  runs-on: ubuntu-latest
  needs: [bench-build, bench-matrix]
  timeout-minutes: ${{ matrix.bench.timeout_minutes }}
  strategy:
    # One job per (target, shard) pair. Each job executes the binary
    # pre-built by `bench-build`, so its runtime budget is purely the
    # criterion measurement plus post-processing.
    fail-fast: false
    matrix:
      bench: ${{ fromJSON(needs.bench-matrix.outputs.targets) }}
      # The shard plan comes from `bench-matrix.outputs.shards`. Each
      # shard owns one strategy-grouped level bundle (PR runs a single
      # `pr-canonical` shard with level_3 + level_22; main runs nine
      # strategy groups — see #164 for the fast/lazy split rationale).
      # `shard.levels` is a CSV forwarded into
      # `STRUCTURED_ZSTD_BENCH_LEVEL_FILTER` so the bench binary
      # iterates the requested levels in one process.
      shard: ${{ fromJSON(needs.bench-matrix.outputs.shards) }}
  steps:
    # The checkout supplies the corpus files referenced via
    # `env!("CARGO_MANIFEST_DIR")` inside the bench binary, the
    # run-benchmarks.sh script, and the python post-processor.
    - uses: actions/checkout@v6
    - name: Install target runtime dependencies
      if: matrix.bench.runtime_setup != ''
      run: ${{ matrix.bench.runtime_setup }}
    - name: Download pre-built bench binary
      uses: actions/download-artifact@v8
      with:
        path: bench-binary
        name: bench-binary-${{ matrix.bench.id }}
    - name: Mark bench binaries executable
      # `actions/download-artifact` strips the executable bit
      # (downloaded files land as mode 0644). Both binaries in the
      # artifact need +x; otherwise the `-x` check in run-benchmarks.sh
      # would reject the memory binary on main pushes.
      run: |
        chmod +x bench-binary/compare_ffi
        if [ -f bench-binary/compare_ffi_memory ]; then
          chmod +x bench-binary/compare_ffi_memory
        fi
    - name: Run benchmarks (filtered to shard's levels)
      env:
        STRUCTURED_ZSTD_BENCH_TARGET: ${{ matrix.bench.id }}
        STRUCTURED_ZSTD_BENCH_TRIPLE: ${{ matrix.bench.target_triple }}
        STRUCTURED_ZSTD_BENCH_GENERATED_AT: ${{ github.event_name == 'pull_request' && github.event.pull_request.updated_at || github.event.head_commit.timestamp || github.event.repository.updated_at }}
        STRUCTURED_ZSTD_BENCH_LEVEL_FILTER: ${{ matrix.shard.levels }}
        # Tells run-benchmarks.sh to re-exec this binary instead of
        # calling `cargo bench`.
        STRUCTURED_ZSTD_BENCH_BIN: ${{ github.workspace }}/bench-binary/compare_ffi
        # The memory bench runs on main pushes only: its
        # TrackingAllocator measures peak alloc bytes precisely but adds
        # per-allocation overhead, which is unwanted on every PR review
        # cycle. On `push` events run-benchmarks.sh invokes the second
        # binary sequentially, and its REPORT_MEM lines feed the
        # dashboard's `peak_alloc_bytes` metric. On other events the
        # variable is the empty string.
        STRUCTURED_ZSTD_BENCH_MEMORY_BIN: ${{ github.event_name == 'push' && format('{0}/bench-binary/compare_ffi_memory', github.workspace) || '' }}
        # The prebuilt binary is launched directly (not via cargo), so
        # `env::var("CARGO_MANIFEST_DIR")` returns None inside it.
        # Without this override `load_decode_corpus_scenario()` falls
        # back to the synthetic 1 MiB corpus, and the bench label
        # silently flips from `decodecorpus-z000033` to
        # `decodecorpus-synthetic-1m`, making dashboards diverge from a
        # baseline produced via `cargo bench`. This points the binary at
        # the checkout's real fixture.
        STRUCTURED_ZSTD_BENCH_CORPUS_PATH: ${{ github.workspace }}/zstd/decodecorpus_files/z000033
      run: bash .github/scripts/run-benchmarks.sh
    - name: Rename benchmark outputs for matrix artifact
      # Suffix each output with `<target>.<shard>` so files from
      # different matrix jobs never collide.
      run: |
        mv benchmark-results.json benchmark-results.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
        mv benchmark-report.md benchmark-report.${{ matrix.bench.id }}.${{ matrix.shard.id }}.md
        mv benchmark-delta.json benchmark-delta.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
        mv benchmark-delta.md benchmark-delta.${{ matrix.bench.id }}.${{ matrix.shard.id }}.md
        mv benchmark-relative.json benchmark-relative.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
    - name: Upload benchmark shard artifacts
      uses: actions/upload-artifact@v7
      with:
        name: benchmark-shard-${{ matrix.bench.id }}-${{ matrix.shard.id }}
        # Intermediate inputs to `benchmark-aggregate`; retention
        # matches the 7 days used for `bench-binary-*`.
        retention-days: 7
        if-no-files-found: error
        path: |
          benchmark-results.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
          benchmark-report.${{ matrix.bench.id }}.${{ matrix.shard.id }}.md
          benchmark-delta.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
          benchmark-delta.${{ matrix.bench.id }}.${{ matrix.shard.id }}.md
          benchmark-relative.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
benchmark-aggregate:
  # Collapses the per-shard benchmark outputs into one file set per
  # target, re-uploads them as a single artifact, and on a push to
  # `main` saves the x86_64-gnu results as the new baseline.
  name: Aggregate benchmark shards per target
  needs: [benchmark, bench-matrix]
  timeout-minutes: 10
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v6
    - name: Generate bot token
      id: bot-token
      # Minted only where it is consumed: the main-push baseline save
      # at the end of this job is the sole user of this token, so pull
      # request runs (including ones where repository secrets are not
      # available) skip the step instead of creating an unused app
      # token. Same condition as the token step in `benchmark-pages`.
      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
      uses: actions/create-github-app-token@v3
      with:
        app-id: ${{ secrets.RELEASER_APP_ID }}
        private-key: ${{ secrets.RELEASER_APP_PRIVATE_KEY }}
    - name: Download benchmark shard artifacts
      uses: actions/download-artifact@v8
      with:
        pattern: benchmark-shard-*
        path: benchmark-artifacts
    - name: Aggregate level shards into per-target files
      env:
        AGGREGATE_TARGETS: ${{ needs.bench-matrix.outputs.ids_csv }}
      run: python3 .github/scripts/aggregate-bench-levels.py
    - name: Upload aggregated benchmark artifact
      # Single combined artifact carrying per-target consolidated
      # files. `merge-benchmarks.py` rglob's the download root, so it
      # picks them up regardless of subdir layout; this lets
      # `benchmark-pages` download one artifact instead of one per
      # target.
      uses: actions/upload-artifact@v7
      with:
        name: benchmark-aggregated
        path: |
          benchmark-results.*.json
          benchmark-report.*.md
          benchmark-delta.*.json
          benchmark-delta.*.md
          benchmark-relative.*.json
        if-no-files-found: error
        retention-days: 7
    # Save baseline (main push only). Intentionally NO `fail-on-alert`
    # and NO `comment-on-alert` here — regression alerts are handled
    # exclusively by the `benchmark-regression-check` job, which runs
    # only on regular developer PRs. Mixing the alert/fail path with the
    # save path would re-create the stuck-baseline cascade from #158: a
    # regression on main push would fail the step before
    # `save-data-file` ran, freezing the baseline indefinitely.
    - name: Save benchmark baseline (main push only)
      if: steps.bot-token.outputs.token != '' && github.event_name == 'push' && github.ref == 'refs/heads/main'
      uses: benchmark-action/github-action-benchmark@v1
      with:
        name: "structured-zstd vs C FFI (x86_64-gnu)"
        tool: customSmallerIsBetter
        output-file-path: benchmark-results.x86_64-gnu.json
        github-token: ${{ steps.bot-token.outputs.token }}
        auto-push: true
        save-data-file: true
        comment-on-alert: false
        fail-on-alert: false
        alert-threshold: "130%"
        benchmark-data-dir-path: dev/bench
benchmark-pages:
  # Merges the aggregated per-target payloads on every run that reaches
  # this job. Only a push to `main` checks out gh-pages and publishes
  # the merged reports there; on pull requests the merge script runs
  # but nothing is pushed.
  name: Publish benchmark pages payloads
  needs: benchmark-aggregate
  if: github.event_name == 'pull_request' || (github.event_name == 'push' && github.ref == 'refs/heads/main')
  timeout-minutes: 20
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v6
    - name: Generate bot token
      id: bot-token
      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
      uses: actions/create-github-app-token@v3
      with:
        app-id: ${{ secrets.RELEASER_APP_ID }}
        private-key: ${{ secrets.RELEASER_APP_PRIVATE_KEY }}
    - name: Checkout gh-pages with push token
      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
      uses: actions/checkout@v6
      with:
        ref: gh-pages
        token: ${{ steps.bot-token.outputs.token }}
        path: gh-pages
    # Only the aggregated per-target files feed merge-benchmarks.py.
    # The level shards (benchmark-shard-*) are intermediate inputs to
    # `benchmark-aggregate` and would otherwise pollute
    # merge-benchmarks.py's per-target name extraction with
    # `<target>.<level>` keys, so just the single combined
    # `benchmark-aggregated` artifact is downloaded here.
    - uses: actions/download-artifact@v8
      with:
        name: benchmark-aggregated
        path: benchmark-artifacts
    - name: Merge relative/delta payloads
      run: python3 .github/scripts/merge-benchmarks.py
    - name: Publish benchmark reports to gh-pages
      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
      # Copies the merged reports and the dashboard page into the
      # gh-pages checkout, commits only when something is staged, and
      # pushes. The author email uses GitHub's noreply domain
      # (`users.noreply.github.com`) so commits are attributed to
      # github-actions[bot].
      run: |
        mkdir -p gh-pages/dev/bench
        cp merged/benchmark-report.md gh-pages/dev/bench/benchmark-report.md
        cp merged/benchmark-delta.json gh-pages/dev/bench/benchmark-delta.json
        cp merged/benchmark-delta.md gh-pages/dev/bench/benchmark-delta.md
        cp merged/benchmark-summary.md gh-pages/dev/bench/benchmark-summary.md
        cp merged/benchmark-delta-summary.md gh-pages/dev/bench/benchmark-delta-summary.md
        cp merged/benchmark-relative.json gh-pages/dev/bench/benchmark-relative.json
        cp .github/bench-dashboard/index.html gh-pages/dev/bench/index.html
        cd gh-pages
        git config user.name "github-actions[bot]"
        git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
        git add dev/bench/index.html dev/bench/benchmark-report.md dev/bench/benchmark-delta.json dev/bench/benchmark-delta.md dev/bench/benchmark-relative.json dev/bench/benchmark-summary.md dev/bench/benchmark-delta-summary.md
        git diff --cached --quiet || git commit -m "chore(bench): publish benchmark reports"
        git push origin gh-pages
# Regression alert gate — runs ONLY on regular developer PRs from this
# repo. Intentionally NOT on:
#   * push events (main is the immutable historical record; the
#     dev/bench dashboard surfaces regressions visually, no need to
#     fail CI red and freeze the publish chain)
#   * release-plz PRs (version-bump only, no source changes → perf
#     deltas are noise)
#   * fork PRs (no access to gh-pages baseline anyway, would just emit
#     a confusing comparison-failed line)
#
# Decoupled from `benchmark-aggregate` / `benchmark-pages` so that a
# red alert here does NOT block publish or baseline save (see #158 for
# the stuck-baseline cascade this avoids).
benchmark-regression-check:
  name: Bench regression alert (developer PR only)
  needs: benchmark-aggregate
  if: |
    github.event_name == 'pull_request' &&
    github.event.pull_request.head.repo.full_name == github.repository &&
    github.event.pull_request.user.login != 'release-plz[bot]' &&
    !startsWith(github.head_ref, 'release-plz-')
  # The workflow-level default grants `contents: read` only. With
  # `comment-on-alert: true` the benchmark action posts a commit
  # comment using GITHUB_TOKEN, and creating a commit comment requires
  # write access to repository contents. Nothing in this job pushes:
  # `auto-push` and `save-data-file` are both off.
  permissions:
    contents: write
  timeout-minutes: 5
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v6
    # The aggregated artifact carries `benchmark-results.<target>.json`
    # at its root; it is downloaded into the workspace root so the
    # action's `output-file-path` resolves without extra path plumbing.
    - uses: actions/download-artifact@v8
      with:
        name: benchmark-aggregated
    - name: Compare against baseline + alert (no save, no push)
      uses: benchmark-action/github-action-benchmark@v1
      with:
        name: "structured-zstd vs C FFI (x86_64-gnu)"
        tool: customSmallerIsBetter
        output-file-path: benchmark-results.x86_64-gnu.json
        github-token: ${{ secrets.GITHUB_TOKEN }}
        # Read-only comparison: no baseline write, no gh-pages push.
        # The save path lives in `benchmark-aggregate` and runs on main
        # push only.
        auto-push: false
        save-data-file: false
        # Surface regressions to the PR author + reviewer. The comment
        # is posted on the head commit so it shows in the PR
        # conversation.
        comment-on-alert: true
        alert-comment-cc-users: "@polaz"
        alert-threshold: "130%"
        fail-threshold: "160%"
        # Red CI on >60% regression so a reviewer can't merge by
        # accident without acknowledging the delta. Failing here does
        # NOT block publish / baseline save (separate job).
        fail-on-alert: true
        benchmark-data-dir-path: dev/bench