perf(decode): straight-loop short path + donor-gated lookahead ring + SeqSymbol repack #1473

Workflow file for this run

	name: CI

	on:
	  push:
	    branches:
	      - main
	    # Commits that only touch docs or the bench dashboard change no code,
	    # tests, or bench inputs, so there is no point running the full
	    # lint → test (3 OS) → cross-i686 → msrv → codecov → fuzz → 27-shard
	    # bench pipeline for them. Republishing the dashboard's `index.html`
	    # is handled by `pages-only.yml`, which is gated on the inverse of
	    # these paths.
	    paths-ignore:
	      - '**.md'
	      - '.github/bench-dashboard/**'
	      - 'docs/**'
	      - 'LICENSE*'
	      - '.gitignore'
	  pull_request:
	    branches:
	      - main
	    # Same ignore list as the `push` trigger above.
	    paths-ignore:
	      - '**.md'
	      - '.github/bench-dashboard/**'
	      - 'docs/**'
	      - 'LICENSE*'
	      - '.gitignore'

	# Default token scope for every job in this workflow.
	permissions:
	  contents: read

	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}
	  # Superseded PR runs are cancelled, but a push to main is never
	  # cancelled mid-flight: the benchmark baseline save and the gh-pages
	  # merge have to complete atomically.
	  cancel-in-progress: ${{ !(github.event_name == 'push' && github.ref == 'refs/heads/main') }}

	env:
	  CARGO_TERM_COLOR: always

	jobs:
	lint:
	timeout-minutes: 10
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@stable
	with:
	components: rustfmt, clippy
	- uses: Swatinem/rust-cache@v2
	- name: Format
	run: cargo fmt --all -- --check
	- name: Clippy
	run: cargo clippy -p structured-zstd --features hash,std,dict_builder -- -D warnings
	- name: Clippy (bench_internals)
	run: cargo clippy -p structured-zstd --features hash,std,dict_builder,bench_internals --benches -- -D warnings
	- name: Gate — compare_ffi/memory must not pull bench_internals
	# Rationale: both `compare_ffi` (timing) and `compare_ffi_memory`
	# (peak alloc) benchmark structured-zstd against libzstd. If
	# `bench_internals` widens our crate's public API surface
	# (visibility changes for `BitReaderReversed` etc.), the Rust
	# side ships compiled differently than what a real downstream
	# consumer uses — biasing every cross-side comparison. Keep
	# bench_internals strictly out of both benches' feature sets.
	#
	# tomllib (Python 3.11+ stdlib) handles both single-line and
	# multi-line TOML arrays, so a future `required-features` array
	# reformat can't sneak `bench_internals` past the gate.
	run: \|
	python3 - <<'PY'
	import sys, tomllib
	with open("zstd/Cargo.toml", "rb") as f:
	cargo = tomllib.load(f)
	benches = {
	bench.get("name"): bench.get("required-features", [])
	for bench in cargo.get("bench", [])
	}
	violators = [
	name for name in ("compare_ffi", "compare_ffi_memory")
	if "bench_internals" in benches.get(name, [])
	]
	if violators:
	for name in violators:
	print(f"::error::{name} must NOT require bench_internals — would bias Rust-vs-FFI parity")
	sys.exit(1)
	PY
	- name: Gate — bench instrumentation must not leak into zstd/src/
	# Rationale: bench-only memory observation (TrackingAllocator,
	# customMem hooks) lives in zstd/benches/. Anything
	# bench-instrumentation-shaped in zstd/src/ would bloat the
	# published crate. Comments are OK; identifier references in
	# actual code are not.
	#
	# The second `rg -v` filters out comment-only lines so a doc
	# comment referencing these names (e.g. "see bench's
	# `TrackingAllocator`") doesn't trip the gate. Matches
	# `path:line:` followed by leading whitespace and a Rust
	# comment prefix (`//`, `///`, `//!`, `/`, ` `).
	run: \|
	leaked=$(rg -n --no-heading \
	'TrackingAllocator\|ALLOC_PEAK\|ALLOC_CURRENT\|TRACKING_ENABLED\|ZSTD_customMem\|customMem\(' \
	zstd/src/ \
	\| rg -v '^[^:]+:[0-9]+:\s(//\|/\\|\*)' \|\| true)
	if [ -n "$leaked" ]; then
	echo "$leaked"
	echo "::error::bench-only instrumentation symbols leaked into zstd/src/"
	exit 1
	fi

	test:
	needs: lint
	timeout-minutes: 15
	strategy:
	# Each OS hits its own runner-image regressions (macos-latest currently
	# ships Homebrew rustup without `cargo`/`rustc` shims under
	# `~/.cargo/bin`, so plain `cargo …` resolves to `rustup-init`). Run
	# every OS so one glitchy image doesn't mask the others.
	fail-fast: false
	matrix:
	os: [ubuntu-latest, windows-latest, macos-latest]
	runs-on: ${{ matrix.os }}
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@stable
	- name: Prepend toolchain bin to PATH (macos shim workaround)
	if: runner.os == 'macOS'
	# macos-latest preinstalled rustup ships via Homebrew without the
	# `~/.cargo/bin/{cargo,rustc}` proxy shims. `rustup run stable cargo`
	# works for the outer call but the cargo it launches then invokes
	# `rustc -vV` through the same broken proxy. Putting the toolchain's
	# actual `bin/` ahead on PATH gives every nested invocation the
	# real binaries.
	run: \|
	TC="$(rustup show active-toolchain \| awk '{print $1}')"
	echo "$HOME/.rustup/toolchains/$TC/bin" >> $GITHUB_PATH
	- uses: taiki-e/install-action@nextest
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: ${{ runner.os }}-cargo
	- name: Test
	working-directory: zstd
	run: cargo nextest run --profile ci -p structured-zstd --features hash,std,dict_builder
	- name: Doc tests
	run: cargo test --doc -p structured-zstd --features hash,std,dict_builder

	cross-i686:
	needs: lint
	timeout-minutes: 15
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@stable
	- uses: taiki-e/install-action@nextest
	- name: Install i686 target
	run: rustup target add i686-unknown-linux-gnu
	- name: Install 32-bit libs
	run: sudo apt-get update && sudo apt-get install -y gcc-multilib
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: cross-i686
	- name: Test (i686)
	working-directory: zstd
	run: cargo nextest run --profile ci -p structured-zstd --features hash,std,dict_builder --target i686-unknown-linux-gnu

	msrv:
	needs: lint
	timeout-minutes: 15
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@stable
	with:
	toolchain: "1.92.0"
	- uses: taiki-e/install-action@nextest
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: msrv
	- name: Test (MSRV)
	working-directory: zstd
	run: cargo nextest run --profile ci -p structured-zstd --features hash,std,dict_builder

	no-std:
	# Build-only smoke test that the crate compiles under no_std + alloc.
	# The decoder ships #![no_std] with optional `std` feature gating the
	# io::Read/Write impls + runtime CPUID detection; without this gate
	# nothing catches a regression where a new `use std::...` slips into
	# a hot decode path and silently breaks downstream embedded users.
	# Two configurations cover the realistic deployment surface:
	# alloc-only (zero features) and alloc + xxhash content checksum.
	# Clippy runs cargo's compile checks plus linting, so a separate
	# `cargo check` pass would just duplicate work.
	needs: lint
	timeout-minutes: 10
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@stable
	with:
	components: clippy
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: no-std
	- name: Clippy (no_std)
	working-directory: zstd
	run: cargo clippy -p structured-zstd --no-default-features -- -D warnings
	- name: Clippy (no_std + hash)
	working-directory: zstd
	run: cargo clippy -p structured-zstd --no-default-features --features hash -- -D warnings

	codecov:
	needs: lint
	timeout-minutes: 15
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@nightly
	with:
	components: llvm-tools-preview
	- uses: Swatinem/rust-cache@v2
	- uses: taiki-e/install-action@cargo-llvm-cov
	- run: cargo llvm-cov -p structured-zstd --features hash,std,dict_builder --lcov --output-path lcov.info
	working-directory: zstd
	- uses: codecov/codecov-action@v6
	with:
	files: zstd/lcov.info
	env:
	CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

	fuzz:
	# Short-budget libFuzzer smoke run across all targets in a single
	# runner. The asan+sancov sub-build of `structured-zstd` is the
	# dominant cost (~2 min); running the five targets sequentially in
	# one job amortises that build (cargo cache reuse drops targets
	# 2..5 to a few seconds each) and halves the compute vs a per-
	# target matrix. Wall-clock stays bounded by the bench matrix,
	# which runs in parallel and takes far longer.
	#
	# The repo ships a regression corpus in `zstd/fuzz/artifacts/`;
	# for every target cargo-fuzz replays that corpus first (any old
	# crash that resurfaces fails the job), then runs
	# `-max_total_time` seconds of fresh fuzzing on top.
	name: Fuzz smoke
	needs: lint
	timeout-minutes: 20
	runs-on: ubuntu-latest
	env:
	# Override `rust-toolchain.toml` (which pins stable) so `cargo fuzz`
	# — which requires `-Z sanitizer` from nightly — gets a nightly
	# compiler inside the fuzz crate sub-build.
	RUSTUP_TOOLCHAIN: nightly
	# Explicit target: the prebuilt `cargo-fuzz` binary installed via
	# taiki-e/install-action is statically linked against musl and its
	# `default_target()` probe picks `x86_64-unknown-linux-musl`,
	# which fails because libFuzzer's AddressSanitizer cannot link
	# against a static libc (`sanitizer is incompatible with
	# statically linked libc`). Pinning the gnu target sidesteps the
	# probe and matches the toolchain rustc actually has stdlib for.
	FUZZ_TARGET_TRIPLE: x86_64-unknown-linux-gnu
	# Single source of truth for the fuzz target inventory; both the
	# corpus replay step and the fresh-fuzz step iterate over this.
	FUZZ_TARGETS: "decode encode interop huff0 fse"
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@nightly
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: fuzz
	workspaces: zstd/fuzz
	- name: Install cargo-fuzz
	uses: taiki-e/install-action@v2
	with:
	tool: cargo-fuzz
	- name: Replay regression corpus
	# Drive the existing crash artifacts through each target so
	# any reintroduction of a previously-fixed bug fails CI on
	# the same input that originally surfaced it.
	working-directory: zstd/fuzz
	run: \|
	for target in $FUZZ_TARGETS; do
	if [ -d "artifacts/$target" ] && [ -n "$(ls artifacts/$target 2>/dev/null)" ]; then
	echo "Replaying $(ls artifacts/$target \| wc -l) regression artifacts for $target"
	cargo fuzz run --target "$FUZZ_TARGET_TRIPLE" "$target" artifacts/"$target"/*
	else
	echo "No regression artifacts for $target — skipping replay"
	fi
	done
	- name: Fuzz (90s per target)
	working-directory: zstd/fuzz
	# `-max_total_time` is libFuzzer's own time cap; on top of
	# the GitHub Actions timeout-minutes it gives us a hard
	# ceiling even if a target wedges in setup.
	run: \|
	for target in $FUZZ_TARGETS; do
	echo "::group::Fuzz $target"
	cargo fuzz run --target "$FUZZ_TARGET_TRIPLE" "$target" -- -max_total_time=90 -timeout=30
	echo "::endgroup::"
	done
	- name: Upload new crash artifacts on failure
	if: failure()
	uses: actions/upload-artifact@v7
	with:
	name: fuzz-artifacts
	path: zstd/fuzz/artifacts/
	if-no-files-found: ignore
	retention-days: 14

	bench-matrix:
	# Canonical bench target inventory. Every downstream bench job
	# (`bench-build`, `benchmark`, `benchmark-aggregate`,
	# `benchmark-pages`) consumes this so a new target ID is added
	# in exactly one place. `build_setup` runs only on `bench-build`;
	# `runtime_setup` runs only on the bench shards (e.g. `i686-gnu`
	# needs gcc-multilib's 32-bit loader at runtime, while `x86_64-musl`
	# only needs `musl-tools` for the build).
	#
	# Release-plz PRs are version-bump-only (no source changes), so the
	# entire bench pipeline (matrix → build → shards → aggregate →
	# pages → regression) is gated out for them via this `if:` filter.
	# The skip cascades through `needs:` so downstream bench jobs
	# also stay green-skipped on those PRs. See #164.
	name: Resolve bench target matrix
	needs: lint
	if: \|
	github.event_name != 'pull_request' \|\|
	(github.event.pull_request.user.login != 'release-plz[bot]' &&
	!startsWith(github.head_ref, 'release-plz-'))
	runs-on: ubuntu-latest
	outputs:
	targets: ${{ steps.set.outputs.targets }}
	ids_csv: ${{ steps.set.outputs.ids_csv }}
	shards: ${{ steps.set.outputs.shards }}
	shards_csv: ${{ steps.set.outputs.shards_csv }}
	steps:
	- id: set
	env:
	# Drives the shard plan below. On a `pull_request` we run
	# only the two canonical levels (`level_3_dfast` = donor
	# default, `level_22_btultra2` = max compression) bundled
	# into a single shard per target — three shards total, cheap
	# PR feedback. On a `push: main` (post-merge), one shard per
	# strategy group runs — nine groups (fast split into
	# `fast-neg` / `fast-pos`; lazy split into `lazy-lower` /
	# `lazy-upper` to keep the worst-case per-shard wall under
	# the 120-min CI cap) × three targets = 27 shards, so the
	# published gh-pages snapshot keeps full coverage for the
	# dashboard + tagged baselines (#164).
	EVENT_NAME: ${{ github.event_name }}
	run: \|
	cat > targets.json <<'EOF'
	[
	{
	"id": "x86_64-gnu",
	"target_triple": "x86_64-unknown-linux-gnu",
	"build_setup": "",
	"runtime_setup": "",
	"timeout_minutes": 120
	},
	{
	"id": "i686-gnu",
	"target_triple": "i686-unknown-linux-gnu",
	"build_setup": "sudo apt-get update && sudo apt-get install -y gcc-multilib libc6-dev-i386",
	"runtime_setup": "sudo apt-get update && sudo apt-get install -y libc6-dev-i386",
	"timeout_minutes": 120
	},
	{
	"id": "x86_64-musl",
	"target_triple": "x86_64-unknown-linux-musl",
	"build_setup": "sudo apt-get update && sudo apt-get install -y musl-tools",
	"runtime_setup": "",
	"timeout_minutes": 120
	}
	]
	EOF
	targets_compact=$(jq -c . targets.json)
	ids_csv=$(jq -r '[.[].id] \| join(",")' targets.json)
	echo "targets=$targets_compact" >> "$GITHUB_OUTPUT"
	echo "ids_csv=$ids_csv" >> "$GITHUB_OUTPUT"
	echo "Bench targets: $ids_csv"

	# Shard plan: each entry runs a comma-separated set of
	# levels through one bench binary invocation via the
	# `STRUCTURED_ZSTD_BENCH_LEVEL_FILTER` env var. `id` drives
	# the artifact name (`benchmark-shard-<target>-<id>`) and
	# the per-file suffix in the markdown / JSON outputs.
	#
	# PR event = single shard covering the two canonical levels
	# so reviewers see ratio + speed + memory deltas on the
	# default-level path (level_3_dfast) and the max-compression
	# path (level_22_btultra2) within minutes. Strategy groups
	# mirror `clevels.h` + `StrategyTag::for_level` so an
	# entire strategy's levels share a runner — keeps per-job
	# build overhead amortised across the levels of that family.
	if [ "$EVENT_NAME" = "pull_request" ]; then
	cat > shards.json <<'EOF'
	[
	{
	"id": "pr-canonical",
	"levels": "level_3_dfast,level_22_btultra2"
	}
	]
	EOF
	else
	cat > shards.json <<'EOF'
	[
	{
	"id": "fast-neg",
	"levels": "level_-7_fast,level_-6_fast,level_-5_fast,level_-4_fast,level_-3_fast"
	},
	{
	"id": "fast-pos",
	"levels": "level_-2_fast,level_-1_fast,level_1_fast"
	},
	{
	"id": "dfast",
	"levels": "level_2_dfast,level_3_dfast"
	},
	{
	"id": "greedy",
	"levels": "level_4_greedy"
	},
	{
	"id": "lazy-lower",
	"levels": "level_5_lazy,level_6_lazy,level_7_lazy,level_8_lazy,level_9_lazy"
	},
	{
	"id": "lazy-upper",
	"levels": "level_10_lazy,level_11_lazy,level_12_lazy,level_13_lazy,level_14_lazy,level_15_lazy"
	},
	{
	"id": "btopt",
	"levels": "level_16_btopt,level_17_btopt"
	},
	{
	"id": "btultra",
	"levels": "level_18_btultra,level_19_btultra"
	},
	{
	"id": "btultra2",
	"levels": "level_20_btultra2,level_21_btultra2,level_22_btultra2"
	}
	]
	EOF
	fi
	shards_compact=$(jq -c . shards.json)
	shards_csv=$(jq -r '[.[].id] \| join(",")' shards.json)
	echo "shards=$shards_compact" >> "$GITHUB_OUTPUT"
	echo "shards_csv=$shards_csv" >> "$GITHUB_OUTPUT"
	echo "Bench shards ($EVENT_NAME): $shards_csv"

	bench-build:
	# Build the criterion `compare_ffi` binary once per target. Every
	# downstream bench shard (target × level) downloads the binary
	# via `bench-binary-<target>` artifact and runs it directly — no
	# rebuild per shard. Saves ~4-7 min on each of the 18 shard
	# runners.
	name: Build bench binary (${{ matrix.bench.id }})
	needs: [lint, bench-matrix]
	timeout-minutes: 20
	strategy:
	fail-fast: false
	matrix:
	bench: ${{ fromJSON(needs.bench-matrix.outputs.targets) }}
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@stable
	- name: Install benchmark target
	run: rustup target add ${{ matrix.bench.target_triple }}
	- name: Install build toolchain dependencies
	if: matrix.bench.build_setup != ''
	run: ${{ matrix.bench.build_setup }}
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: bench-${{ matrix.bench.id }}
	- name: Build compare_ffi + compare_ffi_memory bench binaries
	env:
	CC_x86_64_unknown_linux_musl: musl-gcc
	# The donor zstd-sys C library uses runtime feature detection
	# (is_x86_feature_detected!-equivalent) so it transparently
	# picks up BMI2/AVX2/etc. on the runner. Pure-Rust hot paths
	# gate intrinsics on COMPILE-time cfg!(target_feature = ...)
	# and the default rustc x86_64 target ships with SSE2 only.
	# Without explicit target selection the bench compares
	# "donor with full ISA" vs "us with SSE2 baseline" — not
	# apples-to-apples.
	#
	# Use a DETERMINISTIC baseline (x86-64-v3 = BMI2 + AVX2 +
	# everything in the Haswell ISA, the 2013+ x86_64 baseline)
	# ONLY for x86_64 targets via target.<triple>.rustflags. NOT
	# target-cpu=native: that picks whatever CPU the BUILD runner
	# has, which (a) varies across github-runners, (b) crashes
	# with SIGILL when a bench shard runner lacks features the
	# build runner had, and (c) is meaningless for cross-compile
	# targets like i686-unknown-linux-gnu.
	#
	# i686 / non-x86 / musl targets keep the default rustc
	# baseline. Measured +8.5% on
	# decompress/level_-1_fast/decodecorpus-z000033/c_stream on
	# i9-9900K; the win comes from _bzhi_u64-backed
	# mask_lower_bits in the FSE state-update hot path.
	CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: "-C target-cpu=x86-64-v3"
	CARGO_TARGET_X86_64_UNKNOWN_LINUX_MUSL_RUSTFLAGS: "-C target-cpu=x86-64-v3"
	run: \|
	# `--no-run` builds without executing; `--message-format=json`
	# exposes the resolved binary path in the build log so we
	# can ship just the executable to shards. Building BOTH
	# bench binaries in one cargo invocation reuses dependency
	# compilation between them (zstd-sys, criterion, etc.).
	cargo bench --bench compare_ffi --bench compare_ffi_memory \
	-p structured-zstd --features dict_builder \
	--target ${{ matrix.bench.target_triple }} --no-run \
	--message-format=json > build.log
	mkdir -p bench-binary
	for name in compare_ffi compare_ffi_memory; do
	bin_path=$(jq -r --arg n "$name" 'select(.executable != null and (.target.name == $n)) \| .executable' build.log \| tail -1)
	if [ -z "$bin_path" ] \|\| [ ! -x "$bin_path" ]; then
	echo "ERROR: failed to locate $name binary in cargo output" >&2
	cat build.log \| jq -r 'select(.executable != null) \| "\(.target.name) \(.executable)"' >&2
	exit 1
	fi
	cp "$bin_path" "bench-binary/$name"
	chmod +x "bench-binary/$name"
	echo "$name size: $(wc -c < bench-binary/$name) bytes"
	done
	- name: Upload bench binary
	uses: actions/upload-artifact@v7
	with:
	name: bench-binary-${{ matrix.bench.id }}
	path: bench-binary/
	if-no-files-found: error
	retention-days: 7

	benchmark:
	name: Bench ${{ matrix.bench.id }} / ${{ matrix.shard.id }}
	needs: [bench-build, bench-matrix]
	timeout-minutes: ${{ matrix.bench.timeout_minutes }}
	strategy:
	# Matrix split target × level. The pre-built binary from
	# `bench-build` is what each shard executes, so the runtime
	# budget per shard is purely the criterion measurement +
	# post-processing. `level22` on i686 is still the natural
	# bottleneck (~20 min); every other (target, level) combo
	# finishes well under 10 min.
	fail-fast: false
	matrix:
	bench: ${{ fromJSON(needs.bench-matrix.outputs.targets) }}
	# Shard plan is resolved in `bench-matrix.outputs.shards`.
	# Each shard owns one strategy-grouped level bundle (PR runs
	# a single `pr-canonical` shard with level_3 + level_22; main
	# runs nine strategy groups — see #164 for the fast/lazy
	# split rationale). `shard.levels` is a CSV that we forward
	# into `STRUCTURED_ZSTD_BENCH_LEVEL_FILTER` so the bench
	# binary iterates the requested levels in one process.
	shard: ${{ fromJSON(needs.bench-matrix.outputs.shards) }}
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	# Needed for: corpus files referenced via `env!("CARGO_MANIFEST_DIR")`
	# inside the bench binary, the run-benchmarks.sh script, and
	# the python post-processor.

	- name: Install target runtime dependencies
	if: matrix.bench.runtime_setup != ''
	run: ${{ matrix.bench.runtime_setup }}

	- name: Download pre-built bench binary
	uses: actions/download-artifact@v8
	with:
	name: bench-binary-${{ matrix.bench.id }}
	path: bench-binary
	- name: Mark bench binaries executable
	# `actions/download-artifact` strips the executable bit
	# (downloaded files land as mode 0644). Both binaries shipped
	# in the artifact need +x — the memory binary's `-x` check in
	# run-benchmarks.sh would otherwise reject it on main pushes.
	run: \|
	chmod +x bench-binary/compare_ffi
	if [ -f bench-binary/compare_ffi_memory ]; then
	chmod +x bench-binary/compare_ffi_memory
	fi

	- name: Run benchmarks (filtered to shard's levels)
	env:
	STRUCTURED_ZSTD_BENCH_TARGET: ${{ matrix.bench.id }}
	STRUCTURED_ZSTD_BENCH_TRIPLE: ${{ matrix.bench.target_triple }}
	STRUCTURED_ZSTD_BENCH_GENERATED_AT: ${{ github.event_name == 'pull_request' && github.event.pull_request.updated_at \|\| github.event.head_commit.timestamp \|\| github.event.repository.updated_at }}
	STRUCTURED_ZSTD_BENCH_LEVEL_FILTER: ${{ matrix.shard.levels }}
	# run-benchmarks.sh: re-exec this binary instead of `cargo bench`.
	STRUCTURED_ZSTD_BENCH_BIN: ${{ github.workspace }}/bench-binary/compare_ffi
	# Memory bench runs only on main pushes — its TrackingAllocator
	# measures peak alloc bytes precisely but adds per-allocation
	# overhead, so we don't want it on every PR review cycle. On
	# main pushes (`event_name == 'push'`) the second binary is
	# invoked sequentially by run-benchmarks.sh and its REPORT_MEM
	# lines feed the dashboard's `peak_alloc_bytes` metric.
	STRUCTURED_ZSTD_BENCH_MEMORY_BIN: ${{ github.event_name == 'push' && format('{0}/bench-binary/compare_ffi_memory', github.workspace) \|\| '' }}
	# The prebuilt bench binary is launched directly (not via cargo),
	# so `env::var("CARGO_MANIFEST_DIR")` returns None inside it.
	# Without this override, `load_decode_corpus_scenario()` falls
	# back to the synthetic 1 MiB corpus and the bench label silently
	# flips from `decodecorpus-z000033` to `decodecorpus-synthetic-1m`,
	# making dashboards diverge from a baseline produced via
	# `cargo bench`. Point the binary at the checkout's real fixture.
	STRUCTURED_ZSTD_BENCH_CORPUS_PATH: ${{ github.workspace }}/zstd/decodecorpus_files/z000033
	run: bash .github/scripts/run-benchmarks.sh

	- name: Rename benchmark outputs for matrix artifact
	run: \|
	mv benchmark-results.json benchmark-results.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
	mv benchmark-report.md benchmark-report.${{ matrix.bench.id }}.${{ matrix.shard.id }}.md
	mv benchmark-delta.json benchmark-delta.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
	mv benchmark-delta.md benchmark-delta.${{ matrix.bench.id }}.${{ matrix.shard.id }}.md
	mv benchmark-relative.json benchmark-relative.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json

	- name: Upload benchmark shard artifacts
	uses: actions/upload-artifact@v7
	with:
	name: benchmark-shard-${{ matrix.bench.id }}-${{ matrix.shard.id }}
	path: \|
	benchmark-results.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
	benchmark-report.${{ matrix.bench.id }}.${{ matrix.shard.id }}.md
	benchmark-delta.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
	benchmark-delta.${{ matrix.bench.id }}.${{ matrix.shard.id }}.md
	benchmark-relative.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
	if-no-files-found: error
	# Intermediate inputs to `benchmark-aggregate`; match the
	# 7-day retention used for `bench-binary-*`.
	retention-days: 7

	benchmark-aggregate:
	name: Aggregate benchmark shards per target
	needs: [benchmark, bench-matrix]
	timeout-minutes: 10
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6

	- name: Generate bot token
	id: bot-token
	if: github.event_name == 'push' \|\| github.event.pull_request.head.repo.full_name == github.repository
	uses: actions/create-github-app-token@v3
	with:
	app-id: ${{ secrets.RELEASER_APP_ID }}
	private-key: ${{ secrets.RELEASER_APP_PRIVATE_KEY }}

	- name: Download benchmark shard artifacts
	uses: actions/download-artifact@v8
	with:
	pattern: benchmark-shard-*
	path: benchmark-artifacts

	- name: Aggregate level shards into per-target files
	env:
	AGGREGATE_TARGETS: ${{ needs.bench-matrix.outputs.ids_csv }}
	run: python3 .github/scripts/aggregate-bench-levels.py

	- name: Upload aggregated benchmark artifact
	# Single combined artifact carrying per-target consolidated
	# files. `merge-benchmarks.py` rglob's the download root so
	# it picks them up regardless of subdir layout; this lets
	# bench-pages download one artifact instead of one per target.
	uses: actions/upload-artifact@v7
	with:
	name: benchmark-aggregated
	path: \|
	benchmark-results.*.json
	benchmark-report.*.md
	benchmark-delta.*.json
	benchmark-delta.*.md
	benchmark-relative.*.json
	if-no-files-found: error
	retention-days: 7

	# Save baseline (main push only). Intentionally NO `fail-on-alert`
	# and NO `comment-on-alert` here — regression alerts are handled
	# exclusively by the `benchmark-regression-check` job below, which
	# runs only on regular developer PRs. Mixing the alert/fail path
	# with the save path here would re-create the stuck-baseline
	# cascade from #158: a regression on main push would fail the step
	# before `save-data-file` ran, freezing the baseline indefinitely.
	- name: Save benchmark baseline (main push only)
	if: steps.bot-token.outputs.token != '' && github.event_name == 'push' && github.ref == 'refs/heads/main'
	uses: benchmark-action/github-action-benchmark@v1
	with:
	name: "structured-zstd vs C FFI (x86_64-gnu)"
	tool: customSmallerIsBetter
	output-file-path: benchmark-results.x86_64-gnu.json
	github-token: ${{ steps.bot-token.outputs.token }}
	auto-push: true
	save-data-file: true
	comment-on-alert: false
	fail-on-alert: false
	alert-threshold: "130%"
	benchmark-data-dir-path: dev/bench

	benchmark-pages:
	name: Publish benchmark pages payloads
	needs: benchmark-aggregate
	if: github.event_name == 'pull_request' \|\| (github.event_name == 'push' && github.ref == 'refs/heads/main')
	timeout-minutes: 20
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6

	- name: Generate bot token
	id: bot-token
	if: github.event_name == 'push' && github.ref == 'refs/heads/main'
	uses: actions/create-github-app-token@v3
	with:
	app-id: ${{ secrets.RELEASER_APP_ID }}
	private-key: ${{ secrets.RELEASER_APP_PRIVATE_KEY }}

	- name: Checkout gh-pages with push token
	if: github.event_name == 'push' && github.ref == 'refs/heads/main'
	uses: actions/checkout@v6
	with:
	ref: gh-pages
	token: ${{ steps.bot-token.outputs.token }}
	path: gh-pages

	# Only the aggregated per-target files feed merge-benchmarks.py.
	# The level shards (benchmark-shard-*) are intermediate inputs
	# to `benchmark-aggregate` and would otherwise pollute
	# merge-benchmarks.py's per-target name extraction with
	# `<target>.<level>` keys, so we download just the single
	# combined `benchmark-aggregated` artifact here.
	- uses: actions/download-artifact@v8
	with:
	name: benchmark-aggregated
	path: benchmark-artifacts

	- name: Merge relative/delta payloads
	run: python3 .github/scripts/merge-benchmarks.py

	- name: Publish benchmark reports to gh-pages
	if: github.event_name == 'push' && github.ref == 'refs/heads/main'
	run: \|
	mkdir -p gh-pages/dev/bench
	cp merged/benchmark-report.md gh-pages/dev/bench/benchmark-report.md
	cp merged/benchmark-delta.json gh-pages/dev/bench/benchmark-delta.json
	cp merged/benchmark-delta.md gh-pages/dev/bench/benchmark-delta.md
	cp merged/benchmark-summary.md gh-pages/dev/bench/benchmark-summary.md
	cp merged/benchmark-delta-summary.md gh-pages/dev/bench/benchmark-delta-summary.md
	cp merged/benchmark-relative.json gh-pages/dev/bench/benchmark-relative.json
	cp .github/bench-dashboard/index.html gh-pages/dev/bench/index.html
	cd gh-pages
	git config user.name "github-actions[bot]"
	git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
	git add dev/bench/index.html dev/bench/benchmark-report.md dev/bench/benchmark-delta.json dev/bench/benchmark-delta.md dev/bench/benchmark-relative.json dev/bench/benchmark-summary.md dev/bench/benchmark-delta-summary.md
	git diff --cached --quiet \|\| git commit -m "chore(bench): publish benchmark reports"
	git push origin gh-pages

	# Regression alert gate — runs ONLY on regular developer PRs from
	# this repo. Intentionally NOT on:
	# * push events (main is the immutable historical record; the
	# dev/bench dashboard surfaces regressions visually, no need to
	# fail CI red and freeze the publish chain)
	# * release-plz PRs (version-bump only, no source changes → perf
	# deltas are noise)
	# * fork PRs (no access to gh-pages baseline anyway, would just
	# emit a confusing comparison-failed line)
	#
	# Decoupled from `benchmark-aggregate` / `benchmark-pages` so that a
	# red alert here does NOT block publish or baseline save (see #158
	# for the stuck-baseline cascade this avoids).
	benchmark-regression-check:
	name: Bench regression alert (developer PR only)
	needs: benchmark-aggregate
	if: \|
	github.event_name == 'pull_request' &&
	github.event.pull_request.head.repo.full_name == github.repository &&
	github.event.pull_request.user.login != 'release-plz[bot]' &&
	!startsWith(github.head_ref, 'release-plz-')
	timeout-minutes: 5
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6

	# The aggregated artifact carries `benchmark-results.<target>.json`
	# at its root; download into workspace root so the action's
	# `output-file-path` resolves without extra path plumbing.
	- uses: actions/download-artifact@v8
	with:
	name: benchmark-aggregated

	- name: Compare against baseline + alert (no save, no push)
	uses: benchmark-action/github-action-benchmark@v1
	with:
	name: "structured-zstd vs C FFI (x86_64-gnu)"
	tool: customSmallerIsBetter
	output-file-path: benchmark-results.x86_64-gnu.json
	github-token: ${{ secrets.GITHUB_TOKEN }}
	# Read-only comparison: no baseline write, no gh-pages push.
	# The save path lives in `benchmark-aggregate` and runs on
	# main push only.
	auto-push: false
	save-data-file: false
	# Surface regressions to the PR author + reviewer. Comment is
	# posted on the head commit so it shows in the PR conversation.
	comment-on-alert: true
	alert-comment-cc-users: "@polaz"
	alert-threshold: "130%"
	fail-threshold: "160%"
	# Red CI on >60% regression so a reviewer can't merge by
	# accident without acknowledging the delta. Failing here does
	# NOT block publish / baseline save (separate job).
	fail-on-alert: true
	benchmark-data-dir-path: dev/bench

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

perf(decode): straight-loop short path + donor-gated lookahead ring + SeqSymbol repack #1473

Workflow file

perf(decode): straight-loop short path + donor-gated lookahead ring + SeqSymbol repack #1473

Uh oh!

Workflow file for this run