perf(hc): lazy-skip + separate dictMatchState beat C on small-window dict compress #2051

Workflow file for this run

	name: CI

	on:
	push:
	branches: [main]
	# Docs-only / dashboard-only commits don't touch code, tests, or
	# bench inputs — no reason to spin up the full lint → test (3 OS) →
	# cross-i686 → msrv → codecov → fuzz → 18-shard bench pipeline
	# (6 strategy groups × 3 targets).
	# The dashboard's `index.html` republish path lives in
	# `pages-only.yml`, which is gated on these same paths inversely.
	paths-ignore:
	- '**.md'
	- '.github/bench-dashboard/**'
	- 'docs/**'
	- 'LICENSE*'
	- '.gitignore'
	pull_request:
	branches: [main]
	paths-ignore:
	- '**.md'
	- '.github/bench-dashboard/**'
	- 'docs/**'
	- 'LICENSE*'
	- '.gitignore'
	# Manual trigger: lets the wasm dashboard shard (`bench-wasm`) be
	# refreshed on demand for changes the push path doesn't auto-gate on
	# — e.g. a wasm bench-harness / parser edit that moves the published
	# numbers without touching the `rust_core` paths that gate the shard.
	workflow_dispatch:

	permissions:
	contents: read

	concurrency:
	group: ${{ github.workflow }}-${{ github.ref }}
	# Keep cancellation for PR churn, but never cancel main pushes mid-flight:
	# benchmark baseline + gh-pages merge must complete atomically.
	cancel-in-progress: ${{ !(github.event_name == 'push' && github.ref == 'refs/heads/main') }}

	env:
	CARGO_TERM_COLOR: always

	jobs:
	changes:
	# Cheap path probe (~10s) consumed by the bench pipeline gate below.
	# Bench numbers (Rust vs C FFI) can only move when the core crate,
	# the workspace manifests that pin deps / profiles, or the toolchain
	# file change. A push touching ONLY wasm / npm / ts / js / CI / docs
	# cannot move them, so `bench-matrix` (and its whole downstream
	# cascade) is skipped on such pushes — see the gate on that job.
	name: Detect changed paths
	runs-on: ubuntu-latest
	outputs:
	rust_core: ${{ steps.filter.outputs.rust_core }}
	steps:
	- uses: actions/checkout@v6
	- uses: dorny/paths-filter@v4
	id: filter
	with:
	filters: \|
	# `rust_core` gates BOTH bench pipelines (native `bench-matrix`
	# and the wasm-vs-bokuweb `bench-wasm` shard). The wasm payload
	# compiles the core crate to wasm32, so the numbers it tracks
	# move exactly when the core crate / manifests / toolchain move,
	# NOT when the wasm-crate / npm glue changes (that path is
	# validated by the always-on `wasm` job instead). A push that
	# touches only wasm / npm / ts / js / CI / docs cannot move the
	# compression behaviour, so both bench cascades skip it.
	rust_core:
	- 'zstd/**'
	- 'Cargo.toml'
	- 'Cargo.lock'
	- 'rust-toolchain.toml'

	lint:
	timeout-minutes: 10
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@stable
	with:
	components: rustfmt, clippy
	- uses: Swatinem/rust-cache@v2
	- name: Format
	run: cargo fmt --all -- --check
	- name: Clippy
	run: cargo clippy -p structured-zstd --features hash,std,dict_builder -- -D warnings
	- name: Clippy (bench_internals)
	run: cargo clippy -p structured-zstd --features hash,std,dict_builder,bench_internals --benches -- -D warnings
	- name: Gate — compare_ffi/memory must not pull bench_internals
	# Rationale: both `compare_ffi` (timing) and `compare_ffi_memory`
	# (peak alloc) benchmark structured-zstd against libzstd. If
	# `bench_internals` widens our crate's public API surface
	# (visibility changes for `BitReaderReversed` etc.), the Rust
	# side ships compiled differently than what a real downstream
	# consumer uses — biasing every cross-side comparison. Keep
	# bench_internals strictly out of both benches' feature sets.
	#
	# tomllib (Python 3.11+ stdlib) handles both single-line and
	# multi-line TOML arrays, so a future `required-features` array
	# reformat can't sneak `bench_internals` past the gate.
	run: \|
	python3 - <<'PY'
	import sys, tomllib
	with open("zstd/Cargo.toml", "rb") as f:
	cargo = tomllib.load(f)
	benches = {
	bench.get("name"): bench.get("required-features", [])
	for bench in cargo.get("bench", [])
	}
	violators = [
	name for name in ("compare_ffi", "compare_ffi_memory")
	if "bench_internals" in benches.get(name, [])
	]
	if violators:
	for name in violators:
	print(f"::error::{name} must NOT require bench_internals — would bias Rust-vs-FFI parity")
	sys.exit(1)
	PY
	- name: Gate — bench instrumentation must not leak into zstd/src/
	  # Rationale: bench-only memory observation (TrackingAllocator,
	  # customMem hooks) lives in zstd/benches/. Anything
	  # bench-instrumentation-shaped in zstd/src/ would bloat the
	  # published crate. Comments are OK; identifier references in
	  # actual code are not.
	  #
	  # The second `rg -v` filters out comment-only lines so a doc
	  # comment referencing these names (e.g. "see bench's
	  # `TrackingAllocator`") doesn't trip the gate. It matches
	  # `path:line:` followed by ANY amount of leading whitespace
	  # (`\s*`, so indented comments are covered too) and a Rust
	  # comment prefix (`//`, `///`, `//!`, `/*`, ` * `).
	  #
	  # `|| true` keeps the step alive when the pipeline finds nothing
	  # (rg exits 1 on "no match"); the verdict comes from `$leaked`.
	  run: |
	    leaked=$(rg -n --no-heading \
	      'TrackingAllocator|ALLOC_PEAK|ALLOC_CURRENT|TRACKING_ENABLED|ZSTD_customMem|customMem\(' \
	      zstd/src/ \
	      | rg -v '^[^:]+:[0-9]+:\s*(//|/\*|\*)' || true)
	    if [ -n "$leaked" ]; then
	      echo "$leaked"
	      echo "::error::bench-only instrumentation symbols leaked into zstd/src/"
	      exit 1
	    fi

	c-abi:
	# Builds the libzstd-compatible C ABI front end and verifies it is a real
	# drop-in: vendored headers match upstream verbatim, the cdylib advertises
	# SONAME libzstd.so.1, every declared symbol is exported, and a genuine C
	# consumer links + round-trips through the vendored header.
	needs: lint
	timeout-minutes: 12
	runs-on: ubuntu-latest
	env:
	UPSTREAM_TAG: v1.5.7
	steps:
	- uses: actions/checkout@v6
	with:
	# Build + test + header-diff job (no push); don't leave the token
	# in the checkout's git config.
	persist-credentials: false
	- uses: dtolnay/rust-toolchain@stable
	with:
	components: clippy
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: c-abi
	- name: Clippy (c-api)
	run: cargo clippy -p structured-zstd-c --all-targets -- -D warnings
	- name: Unit + ABI tests
	run: cargo test -p structured-zstd-c
	- name: 32-bit toolchain (i686)
	# Upstream libzstd ships on 32-bit platforms; the drop-in C ABI must
	# keep building and passing there too (size_t error encoding, struct
	# layouts, and overflow guards all change shape at 32-bit usize).
	run: \|
	rustup target add i686-unknown-linux-gnu
	sudo apt-get update
	sudo apt-get install -y gcc-multilib
	- name: Clippy (c-api, i686)
	run: cargo clippy -p structured-zstd-c --all-targets --target i686-unknown-linux-gnu -- -D warnings
	- name: Unit + ABI tests (i686)
	run: cargo test -p structured-zstd-c --target i686-unknown-linux-gnu
	- name: Vendored headers match upstream verbatim
	# The headers are copied byte-for-byte from the pinned upstream tag; a
	# diff means someone edited a vendored header (forbidden) or the pin
	# moved without re-vendoring.
	run: \|
	set -euo pipefail
	base="https://raw.githubusercontent.com/facebook/zstd/${UPSTREAM_TAG}/lib"
	for h in zstd.h zdict.h zstd_errors.h; do
	curl --retry 5 --retry-all-errors --retry-delay 2 -fsSL "${base}/${h}" -o "/tmp/${h}.upstream"
	if ! diff -u "/tmp/${h}.upstream" "c-api/include/${h}"; then
	echo "::error::c-api/include/${h} diverges from upstream ${UPSTREAM_TAG}"
	exit 1
	fi
	done
	echo "vendored headers identical to upstream ${UPSTREAM_TAG}"
	- name: Build cdylib + staticlib
	run: cargo build -p structured-zstd-c
	- name: SONAME is libzstd.so.1
	  # The cdylib must advertise the same SONAME as upstream libzstd so
	  # a consumer linked against it records `NEEDED libzstd.so.1`.
	  run: |
	    set -euo pipefail
	    so=target/debug/libstructured_zstd.so
	    # `readelf -d` prints the dynamic entry as
	    #   0x... (SONAME)  Library soname: [libzstd.so.1]
	    # The sed program keeps only the text between the brackets; the
	    # `\(...\)` capture group is what `\1` refers to.
	    soname=$(readelf -d "$so" | sed -n 's/.*SONAME.*\[\(.*\)\]/\1/p')
	    echo "SONAME=$soname"
	    test "$soname" = "libzstd.so.1"
	- name: All declared symbols are exported
	run: \|
	set -euo pipefail
	so=target/debug/libstructured_zstd.so
	exported=$(nm -D --defined-only "$so" \| awk '{print $NF}')
	missing=0
	for sym in \
	ZSTD_compress ZSTD_decompress ZSTD_compressBound \
	ZSTD_getFrameContentSize ZSTD_findFrameCompressedSize \
	ZSTD_isError ZSTD_getErrorCode ZSTD_getErrorName ZSTD_getErrorString \
	ZSTD_minCLevel ZSTD_maxCLevel ZSTD_defaultCLevel \
	ZSTD_versionNumber ZSTD_versionString \
	ZSTD_createCCtx ZSTD_freeCCtx ZSTD_createDCtx ZSTD_freeDCtx \
	ZSTD_compressCCtx ZSTD_decompressDCtx ZSTD_sizeof_CCtx ZSTD_sizeof_DCtx \
	ZSTD_frameHeaderSize ZSTD_getFrameHeader ZSTD_getFrameHeader_advanced \
	ZSTD_findDecompressedSize ZSTD_decompressBound \
	ZDICT_trainFromBuffer ZDICT_finalizeDictionary ZDICT_getDictID \
	ZDICT_getDictHeaderSize ZDICT_isError ZDICT_getErrorName \
	ZSTD_CCtx_loadDictionary ZSTD_CCtx_loadDictionary_byReference \
	ZSTD_CCtx_loadDictionary_advanced ZSTD_CCtx_refCDict \
	ZSTD_CCtx_refPrefix ZSTD_CCtx_refPrefix_advanced \
	ZSTD_DCtx_loadDictionary ZSTD_DCtx_loadDictionary_byReference \
	ZSTD_DCtx_loadDictionary_advanced ZSTD_DCtx_refDDict ZSTD_DCtx_refPrefix \
	ZSTD_DCtx_refPrefix_advanced \
	ZSTD_compress_usingCDict ZSTD_decompress_usingDDict \
	ZSTD_createCDict ZSTD_createCDict_byReference ZSTD_freeCDict \
	ZSTD_createDDict ZSTD_createDDict_byReference ZSTD_freeDDict \
	ZSTD_sizeof_CDict ZSTD_sizeof_DDict \
	ZSTD_getDictID_fromCDict ZSTD_getDictID_fromDDict \
	ZSTD_compress_usingDict ZSTD_decompress_usingDict \
	ZSTD_createCDict_advanced ZSTD_createDDict_advanced \
	ZSTD_compress_usingCDict_advanced \
	ZSTD_getDictID_fromDict ZSTD_getDictID_fromFrame \
	ZSTD_estimateCCtxSize ZSTD_estimateCCtxSize_usingCParams \
	ZSTD_estimateCStreamSize_usingCParams \
	ZSTD_estimateDCtxSize ZSTD_estimateCStreamSize ZSTD_estimateDStreamSize \
	ZSTD_getDecompressedSize ZSTD_sizeof_CStream ZSTD_sizeof_DStream \
	ZDICT_trainFromBuffer_fastCover ZDICT_optimizeTrainFromBuffer_fastCover; do
	if ! grep -qx "$sym" <<<"$exported"; then
	echo "::error::symbol $sym not exported from $so"
	missing=1
	fi
	done
	test "$missing" -eq 0
	- name: pkg-config reports upstream version
	  # The generated `libzstd.pc` must report the same version as the
	  # pinned upstream tag. The expected value is derived from the
	  # job-level `UPSTREAM_TAG` (leading `v` stripped) so the header
	  # diff and this check cannot drift apart when the pin moves.
	  run: |
	    set -euo pipefail
	    expected="${UPSTREAM_TAG#v}"
	    # Validate every discovered libzstd.pc, not just the first hit: a
	    # restored cache can leave a stale file ahead of the fresh build.
	    mapfile -t pcs < <(find target -type f -name libzstd.pc)
	    test "${#pcs[@]}" -gt 0
	    for pc in "${pcs[@]}"; do
	      echo "checking $pc"; cat "$pc"
	      # -F: match the version literally (a bare `.` would otherwise
	      # be a regex wildcard); -x: the whole line must match.
	      grep -qxF "Version: ${expected}" "$pc"
	    done
	- name: Real C consumer links + round-trips
	run: \|
	set -euo pipefail
	# The cdylib advertises SONAME libzstd.so.1, so a consumer records a
	# NEEDED dependency on that name; provide it via a symlink. Also
	# expose the canonical `libzstd.so` link name and link with `-lzstd`,
	# exactly the path a real drop-in C consumer uses.
	ln -sf libstructured_zstd.so target/debug/libzstd.so.1
	ln -sf libstructured_zstd.so target/debug/libzstd.so
	cc -std=c11 -Wall -Wextra -Ic-api/include c-api/tests/c_consumer.c \
	-Ltarget/debug -lzstd -o /tmp/c_consumer
	LD_LIBRARY_PATH=target/debug /tmp/c_consumer
	- name: musl static drop-in builds + exports symbols
	  # musl is a std target (not no-std); its default `+crt-static` profile
	  # makes the static archive `libstructured_zstd.a` the canonical drop-in
	  # for Alpine / fully-static binaries (the cdylib is dropped under
	  # crt-static, which is expected). Verify the archive builds and carries
	  # the exported wrappers.
	  run: |
	    set -euo pipefail
	    rustup target add x86_64-unknown-linux-musl
	    cargo build -p structured-zstd-c --target x86_64-unknown-linux-musl
	    a=target/x86_64-unknown-linux-musl/debug/libstructured_zstd.a
	    test -f "$a"
	    # Dump the symbol table once and grep the captured text. Piping
	    # `nm` straight into `grep -q` under `pipefail` is racy: grep
	    # exits on the first hit, nm can die with SIGPIPE while still
	    # writing, and the pipeline then reports a bogus "missing".
	    symbols=$(nm "$a")
	    for sym in ZSTD_compress ZSTD_decompress ZSTD_versionNumber ZSTD_getFrameHeader; do
	      grep -qE " T ${sym}$" <<<"$symbols" || { echo "::error::$sym missing from musl staticlib"; exit 1; }
	    done
	    echo "musl staticlib OK ($(du -h "$a" | cut -f1))"

	test:
	needs: lint
	timeout-minutes: 15
	strategy:
	# Each OS hits its own runner-image regressions (macos-latest currently
	# ships Homebrew rustup without `cargo`/`rustc` shims under
	# `~/.cargo/bin`, so plain `cargo …` resolves to `rustup-init`). Run
	# every OS so one glitchy image doesn't mask the others.
	fail-fast: false
	matrix:
	os: [ubuntu-latest, windows-latest, macos-latest]
	runs-on: ${{ matrix.os }}
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@stable
	- name: Prepend toolchain bin to PATH (macos shim workaround)
	if: runner.os == 'macOS'
	# macos-latest preinstalled rustup ships via Homebrew without the
	# `~/.cargo/bin/{cargo,rustc}` proxy shims. `rustup run stable cargo`
	# works for the outer call but the cargo it launches then invokes
	# `rustc -vV` through the same broken proxy. Putting the toolchain's
	# actual `bin/` ahead on PATH gives every nested invocation the
	# real binaries.
	run: \|
	TC="$(rustup show active-toolchain \| awk '{print $1}')"
	echo "$HOME/.rustup/toolchains/$TC/bin" >> $GITHUB_PATH
	- uses: taiki-e/install-action@nextest
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: ${{ runner.os }}-cargo
	- name: Test
	working-directory: zstd
	run: cargo nextest run --profile ci -p structured-zstd --features hash,std,dict_builder
	- name: Doc tests
	run: cargo test --doc -p structured-zstd --features hash,std,dict_builder

	cross-i686:
	# Runs the `structured-zstd` nextest suite for the 32-bit
	# `i686-unknown-linux-gnu` target on an ubuntu runner. Starts only
	# after `lint` succeeds.
	needs: lint
	timeout-minutes: 15
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@stable
	- uses: taiki-e/install-action@nextest
	# The i686 std target is not part of the default toolchain install.
	- name: Install i686 target
	run: rustup target add i686-unknown-linux-gnu
	# gcc-multilib provides the 32-bit C toolchain pieces for the i686 build.
	- name: Install 32-bit libs
	run: sudo apt-get update && sudo apt-get install -y gcc-multilib
	# Dedicated cache prefix so this job's cache is kept apart from the
	# other jobs' caches.
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: cross-i686
	# Same features and nextest profile as the other test jobs, plus `--target`.
	- name: Test (i686)
	working-directory: zstd
	run: cargo nextest run --profile ci -p structured-zstd --features hash,std,dict_builder --target i686-unknown-linux-gnu

	msrv:
	# Runs the `structured-zstd` nextest suite on a pinned Rust 1.92.0
	# toolchain instead of `stable`. Starts only after `lint` succeeds.
	# NOTE(review): 1.92.0 is presumably the crate's declared
	# `rust-version`; confirm the two are kept in sync.
	needs: lint
	timeout-minutes: 15
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@stable
	with:
	# Quoted so YAML keeps the version as a string.
	toolchain: "1.92.0"
	- uses: taiki-e/install-action@nextest
	# Dedicated cache prefix so this job's cache is kept apart from the
	# other jobs' caches.
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: msrv
	# Same features and nextest profile as the main `test` job.
	- name: Test (MSRV)
	working-directory: zstd
	run: cargo nextest run --profile ci -p structured-zstd --features hash,std,dict_builder

	no-std:
	# Build-only smoke test that the crate compiles under no_std + alloc.
	# The decoder ships #![no_std] with optional `std` feature gating the
	# io::Read/Write impls + runtime CPUID detection; without this gate
	# nothing catches a regression where a new `use std::...` slips into
	# a hot decode path and silently breaks downstream embedded users.
	# Two configurations cover the realistic deployment surface:
	# alloc-only (zero features) and alloc + xxhash content checksum.
	# Clippy runs cargo's compile checks plus linting, so a separate
	# `cargo check` pass would just duplicate work.
	needs: lint
	timeout-minutes: 10
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@stable
	with:
	components: clippy
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: no-std
	- name: Clippy (no_std)
	working-directory: zstd
	run: cargo clippy -p structured-zstd --no-default-features -- -D warnings
	- name: Clippy (no_std + hash)
	working-directory: zstd
	run: cargo clippy -p structured-zstd --no-default-features --features hash -- -D warnings
	# Embedded-minimal kernel trim: scalar kernel only, all SIMD tiers and
	# their dispatch trampolines compiled out. Guards the kernel_* feature
	# gating so a future change can't silently break the trimmed build.
	- name: Clippy (embedded — kernel_scalar only)
	working-directory: zstd
	run: cargo clippy -p structured-zstd --no-default-features --features kernel_scalar,hash -- -D warnings
	- name: Clippy (embedded_minimal — kernel_scalar + std)
	working-directory: zstd
	run: cargo clippy -p structured-zstd --no-default-features --features kernel_scalar,std,hash -- -D warnings

	wasm:
	# #347 / #348: the wasm32 simd128 kernel tier + the npm package
	# (@structured-world/structured-zstd). Builds both payloads (simd128 +
	# scalar), then runs the Node format cross-check against the C reference
	# (@bokuweb/zstd-wasm): our frames decode there and vice versa. `+simd128`
	# is scoped to the wasm32 target via `--config` so host build-scripts don't
	# warn. Cheap (wasm build + node), so it runs on every PR.
	needs: lint
	timeout-minutes: 15
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	with:
	# This job builds + tests only (no push); don't leave the token in
	# the checkout's git config.
	persist-credentials: false
	- uses: dtolnay/rust-toolchain@stable
	with:
	targets: wasm32-unknown-unknown
	components: clippy
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: wasm
	# simd128 tier: `+simd128` is applied to the wasm32 target only, via
	# `--config`, so host build-scripts are not compiled with it.
	- name: Clippy (wasm32 simd128)
	run: cargo clippy -p structured-zstd -p structured-zstd-wasm --target wasm32-unknown-unknown --no-default-features --features kernel_simd128 --config 'target.wasm32-unknown-unknown.rustflags=["-C","target-feature=+simd128"]' -- -D warnings
	# NOTE(review): this "scalar" pass still passes
	# `--features kernel_simd128` and differs from the step above only by
	# omitting the `+simd128` rustflags. Confirm that is intended (feature
	# on, target-feature off) rather than a `kernel_scalar` build like the
	# `no-std` job uses.
	- name: Clippy (wasm32 scalar)
	run: cargo clippy -p structured-zstd -p structured-zstd-wasm --target wasm32-unknown-unknown --no-default-features --features kernel_simd128 -- -D warnings
	- name: Install wasm-pack
	uses: taiki-e/install-action@v2
	with:
	tool: wasm-pack
	- uses: actions/setup-node@v6
	with:
	node-version: "24"
	- name: Build npm package (both payloads + types)
	working-directory: zstd-wasm/npm
	run: \|
	npm ci \|\| npm install
	npm run build
	- name: Format cross-check vs C reference (@bokuweb/zstd-wasm)
	working-directory: zstd-wasm/bench
	run: \|
	npm ci \|\| npm install
	npm test
	- name: wasm size budget
	run: \|
	for p in simd scalar; do
	sz=$(stat -c %s "zstd-wasm/npm/$p/structured_zstd_wasm_bg.wasm")
	echo "$p payload: $sz bytes"
	# Ceiling (~1 MiB) over the ~550 KiB baseline; bumped from 768 KiB
	# when the per-tier match kernels (monomorphised per MLS + the
	# simd128/scalar count tiers) legitimately grew the module, with
	# headroom for the remaining backends gaining simd128 tiers.
	if [ "$sz" -gt 1048576 ]; then
	echo "::error::$p .wasm is $sz bytes, over the 1 MiB budget"
	exit 1
	fi
	done

	bench-wasm:
	# #366: continuous wasm dashboard shard. The native bench matrix runs
	# a prebuilt criterion binary (Rust vs C FFI); this shard instead
	# builds the npm payloads and runs `node zstd-wasm/bench/bench.mjs`,
	# which measures our two wasm tiers (simd128 + scalar) against the
	# most popular npm competitor (@bokuweb/zstd-wasm). Its REPORT* lines
	# feed a dedicated wasm section on the gh-pages dashboard so the
	# wasm speed/ratio vs bokuweb is tracked over time, not just checked
	# locally before an npm publish.
	#
	# Push-to-main only (consistent with the rest of the bench pipeline
	# being push-only — #362) and gated on `rust_core`, the SAME gate as
	# the native `bench-matrix` cascade: the wasm payload compiles the core
	# crate to wasm32, so its numbers move when the Rust compressor changes,
	# not when the wasm-crate / npm glue changes. It publishes only
	# `benchmark-wasm.json` to gh-pages — `index.html` stays owned by
	# `benchmark-pages` / `pages-only.yml` — so the two publishers write
	# disjoint files and never race for the same blob.
	name: Bench wasm vs bokuweb (dashboard shard)
	needs: [lint, changes]
	if: github.event_name == 'workflow_dispatch' \|\| (github.event_name == 'push' && github.ref == 'refs/heads/main' && needs.changes.outputs.rust_core == 'true')
	timeout-minutes: 30
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6

	- uses: dtolnay/rust-toolchain@stable
	with:
	targets: wasm32-unknown-unknown
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: wasm-bench
	- name: Install wasm-pack
	uses: taiki-e/install-action@v2
	with:
	tool: wasm-pack
	- uses: actions/setup-node@v6
	with:
	node-version: "24"

	# `wasm-pack build --release` runs the `-Oz` wasm-opt pass, for which it
	# downloads a pinned binaryen tarball from GitHub releases at build time.
	# That download is network-flaky (transient 5xx / rate-limit) and was
	# failing the whole dashboard shard. Cache wasm-pack's tool dir so the
	# tarball is fetched at most once across runs.
	- name: Cache wasm-pack binaryen download
	uses: actions/cache@v5
	with:
	path: ~/.cache/.wasm-pack
	key: wasm-pack-binaryen-${{ runner.os }}

	- name: Build npm payloads (simd128 + scalar)
	working-directory: zstd-wasm/npm
	run: \|
	npm ci \|\| npm install
	# Retry the build: on a cold binaryen cache the wasm-opt tarball
	# download can transiently fail. Hard-fail after 3 attempts so a real
	# (non-transient) breakage still fails the shard.
	attempt=0
	until npm run build; do
	attempt=$((attempt + 1))
	if [ "$attempt" -ge 3 ]; then
	echo "wasm build failed after $attempt attempts" >&2
	exit 1
	fi
	echo "wasm build attempt $attempt failed (likely transient binaryen fetch); retrying..." >&2
	sleep 15
	done

	- name: Run wasm bench (capture REPORT* lines)
	working-directory: zstd-wasm/bench
	# `pipefail` + `tee`: bench.mjs exits non-zero on any round-trip
	# failure, which must fail the shard. `tee` keeps the captured
	# stdout for the parser regardless of pass/fail.
	run: \|
	npm ci \|\| npm install
	set -o pipefail
	node bench.mjs \| tee "$GITHUB_WORKSPACE/wasm-bench-raw.txt"

	- name: Parse REPORT* lines into dashboard records
	env:
	WASM_BENCH_RAW_FILE: ${{ github.workspace }}/wasm-bench-raw.txt
	STRUCTURED_ZSTD_BENCH_GENERATED_AT: ${{ github.event.head_commit.timestamp \|\| github.event.repository.updated_at }}
	run: python3 .github/scripts/parse-wasm-bench.py

	- name: Upload wasm bench artifact
	uses: actions/upload-artifact@v7
	with:
	name: benchmark-wasm-run
	path: benchmark-wasm-run.json
	if-no-files-found: error
	retention-days: 7

	- name: Generate bot token
	id: bot-token
	uses: actions/create-github-app-token@v3
	with:
	app-id: ${{ secrets.RELEASER_APP_ID }}
	private-key: ${{ secrets.RELEASER_APP_PRIVATE_KEY }}

	- name: Checkout gh-pages with push token
	uses: actions/checkout@v6
	with:
	ref: gh-pages
	token: ${{ steps.bot-token.outputs.token }}
	path: gh-pages

	- name: Merge into persisted wasm timeseries and publish
	env:
	WASM_RUN_FILE: ${{ github.workspace }}/benchmark-wasm-run.json
	WASM_EXISTING_FILE: ${{ github.workspace }}/gh-pages/dev/bench/benchmark-wasm.json
	WASM_OUTPUT_FILE: ${{ github.workspace }}/gh-pages/dev/bench/benchmark-wasm.json
	run: \|
	mkdir -p gh-pages/dev/bench
	python3 .github/scripts/merge-wasm-bench.py
	cd gh-pages
	git config user.name "github-actions[bot]"
	git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
	git add dev/bench/benchmark-wasm.json
	if git diff --cached --quiet; then
	echo "No wasm timeseries change to publish."
	exit 0
	fi
	git commit -m "chore(bench): publish wasm vs bokuweb dashboard data"
	# Disjoint-file publishers can still collide on a concurrent
	# gh-pages push; rebase onto the latest remote tip and retry a
	# few times before giving up.
	for attempt in 1 2 3; do
	if git push origin gh-pages; then
	echo "Published on attempt $attempt."
	exit 0
	fi
	echo "Push rejected (attempt $attempt); rebasing on remote gh-pages."
	git pull --rebase origin gh-pages
	done
	echo "::error::failed to publish wasm timeseries after 3 attempts"
	exit 1

	codecov:
	# Collects line coverage for `structured-zstd` with cargo-llvm-cov and
	# uploads the LCOV report to Codecov. Starts only after `lint` succeeds.
	needs: lint
	timeout-minutes: 15
	runs-on: ubuntu-latest
	steps:
	- uses: actions/checkout@v6
	# Nightly toolchain with the `llvm-tools-preview` component installed
	# for cargo-llvm-cov.
	- uses: dtolnay/rust-toolchain@nightly
	with:
	components: llvm-tools-preview
	- uses: Swatinem/rust-cache@v2
	- uses: taiki-e/install-action@cargo-llvm-cov
	# Runs from `zstd/`, so the report is written to `zstd/lcov.info`,
	# which is the path the upload step below reads.
	- run: cargo llvm-cov -p structured-zstd --features hash,std,dict_builder --lcov --output-path lcov.info
	working-directory: zstd
	- uses: codecov/codecov-action@v7
	with:
	files: zstd/lcov.info
	env:
	# Upload token supplied from repository secrets.
	CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

	fuzz:
	# Short-budget libFuzzer smoke run across all targets in a single
	# runner. The asan+sancov sub-build of `structured-zstd` is the
	# dominant cost (~2 min); running the five targets sequentially in
	# one job amortises that build (cargo cache reuse drops targets
	# 2..5 to a few seconds each) and halves the compute vs a per-
	# target matrix. Wall-clock stays bounded by the bench matrix,
	# which runs in parallel and takes far longer.
	#
	# The repo ships a regression corpus in `zstd/fuzz/artifacts/`;
	# for every target cargo-fuzz replays that corpus first (any old
	# crash that resurfaces fails the job), then runs
	# `-max_total_time` seconds of fresh fuzzing on top.
	name: Fuzz smoke
	needs: lint
	timeout-minutes: 20
	runs-on: ubuntu-latest
	env:
	# Override `rust-toolchain.toml` (which pins stable) so `cargo fuzz`
	# — which requires `-Z sanitizer` from nightly — gets a nightly
	# compiler inside the fuzz crate sub-build.
	RUSTUP_TOOLCHAIN: nightly
	# Explicit target: the prebuilt `cargo-fuzz` binary installed via
	# taiki-e/install-action is statically linked against musl and its
	# `default_target()` probe picks `x86_64-unknown-linux-musl`,
	# which fails because libFuzzer's AddressSanitizer cannot link
	# against a static libc (`sanitizer is incompatible with
	# statically linked libc`). Pinning the gnu target sidesteps the
	# probe and matches the toolchain rustc actually has stdlib for.
	FUZZ_TARGET_TRIPLE: x86_64-unknown-linux-gnu
	# Single source of truth for the fuzz target inventory; both the
	# corpus replay step and the fresh-fuzz step iterate over this.
	FUZZ_TARGETS: "decode encode interop huff0 fse"
	steps:
	- uses: actions/checkout@v6
	- uses: dtolnay/rust-toolchain@nightly
	- uses: Swatinem/rust-cache@v2
	with:
	prefix-key: fuzz
	workspaces: zstd/fuzz
	- name: Install cargo-fuzz
	uses: taiki-e/install-action@v2
	with:
	tool: cargo-fuzz
	- name: Replay regression corpus
	# Drive the existing crash artifacts through each target so
	# any reintroduction of a previously-fixed bug fails CI on
	# the same input that originally surfaced it.
	working-directory: zstd/fuzz
	run: \|
	for target in $FUZZ_TARGETS; do
	if [ -d "artifacts/$target" ] && [ -n "$(ls artifacts/$target 2>/dev/null)" ]; then
	echo "Replaying $(ls artifacts/$target \| wc -l) regression artifacts for $target"
	cargo fuzz run --target "$FUZZ_TARGET_TRIPLE" "$target" artifacts/"$target"/*
	else
	echo "No regression artifacts for $target — skipping replay"
	fi
	done
	- name: Fuzz (90s per target)
	working-directory: zstd/fuzz
	# `-max_total_time` is libFuzzer's own time cap; on top of
	# the GitHub Actions timeout-minutes it gives us a hard
	# ceiling even if a target wedges in setup.
	run: \|
	for target in $FUZZ_TARGETS; do
	echo "::group::Fuzz $target"
	cargo fuzz run --target "$FUZZ_TARGET_TRIPLE" "$target" -- -max_total_time=90 -timeout=30
	echo "::endgroup::"
	done
	- name: Upload new crash artifacts on failure
	if: failure()
	uses: actions/upload-artifact@v7
	with:
	name: fuzz-artifacts
	path: zstd/fuzz/artifacts/
	if-no-files-found: ignore
	retention-days: 14

	bench-matrix:
	# Canonical bench target inventory. Every downstream bench job
	# (`bench-build`, `benchmark`, `benchmark-aggregate`,
	# `benchmark-pages`) consumes this so a new target ID is added
	# in exactly one place. `build_setup` runs only on `bench-build`;
	# `runtime_setup` runs only on the bench shards (e.g. `i686-gnu`
	# needs gcc-multilib's 32-bit loader at runtime, while `x86_64-musl`
	# only needs `musl-tools` for the build).
	#
	# The bench pipeline runs ONLY on push-to-main, never on PRs (#362).
	# Benchmarks on GitHub free runners (~6-8x slower than a dev box) cost
	# ~20 min per (target, L22) shard, and the pre-merge regression gate it
	# used to feed added runner load for little signal. `main` is the
	# immutable historical record: the `dev/bench` dashboard surfaces perf
	# post-merge, which is where it gets published. This single gate
	# cascades through `needs:` so EVERY downstream bench job
	# (`bench-build`, `benchmark`, `benchmark-aggregate`, `benchmark-pages`)
	# stays green-skipped on PRs without each needing its own filter.
	# See #164 (sharding) and #362 (push-only gate).
	#
	# Second gate: `needs.changes.outputs.rust_core`. A push to main that
	# touched ONLY wasm / npm / ts / js / CI / docs cannot move the
	# Rust-vs-FFI numbers, so the bench pipeline is skipped on it (the
	# published dashboard snapshot stays the last Rust-core baseline).
	# Same cascade — skipping `bench-matrix` skips every downstream bench job.
	name: Resolve bench target matrix
	needs: [lint, changes]
	if: github.event_name == 'push' && github.ref == 'refs/heads/main' && needs.changes.outputs.rust_core == 'true'
	runs-on: ubuntu-latest
	outputs:
	targets: ${{ steps.set.outputs.targets }}
	ids_csv: ${{ steps.set.outputs.ids_csv }}
	shards: ${{ steps.set.outputs.shards }}
	shards_csv: ${{ steps.set.outputs.shards_csv }}
	steps:
	- id: set
	env:
	# Drives the shard plan below. The plan has a `pull_request`
	# branch that runs only the two canonical levels
	# (`level_3_dfast` = donor default, `level_22_btultra2` = max
	# compression) bundled into a single shard per target — three
	# shards total. That branch is currently unreachable, because
	# this job's `if:` only admits push-to-main (#362). On a
	# `push: main` (post-merge), one shard per strategy group
	# runs — six groups (fast split into `fast-neg` /
	# `fast-dfast`; lazy split into `lazy-lower` / `lazy-upper`
	# to keep the worst-case per-shard wall under the 120-min CI
	# cap) × three targets = 18 shards, so the published gh-pages
	# snapshot keeps full coverage for the dashboard + tagged
	# baselines (#164).
	EVENT_NAME: ${{ github.event_name }}
	run: \|
	cat > targets.json <<'EOF'
	[
	{
	"id": "x86_64-gnu",
	"target_triple": "x86_64-unknown-linux-gnu",
	"build_setup": "",
	"runtime_setup": "",
	"timeout_minutes": 120
	},
	{
	"id": "i686-gnu",
	"target_triple": "i686-unknown-linux-gnu",
	"build_setup": "sudo apt-get update && sudo apt-get install -y gcc-multilib libc6-dev-i386",
	"runtime_setup": "sudo apt-get update && sudo apt-get install -y libc6-dev-i386",
	"timeout_minutes": 120
	},
	{
	"id": "x86_64-musl",
	"target_triple": "x86_64-unknown-linux-musl",
	"build_setup": "sudo apt-get update && sudo apt-get install -y musl-tools",
	"runtime_setup": "",
	"timeout_minutes": 120
	}
	]
	EOF
	targets_compact=$(jq -c . targets.json)
	ids_csv=$(jq -r '[.[].id] \| join(",")' targets.json)
	echo "targets=$targets_compact" >> "$GITHUB_OUTPUT"
	echo "ids_csv=$ids_csv" >> "$GITHUB_OUTPUT"
	echo "Bench targets: $ids_csv"

	# Shard plan: each entry runs a comma-separated set of
	# levels through one bench binary invocation via the
	# `STRUCTURED_ZSTD_BENCH_LEVEL_FILTER` env var. `id` drives
	# the artifact name (`benchmark-shard-<target>-<id>`) and
	# the per-file suffix in the markdown / JSON outputs.
	#
	# PR event = single shard covering the two canonical levels
	# so reviewers see ratio + speed + memory deltas on the
	# default-level path (level_3_dfast) and the max-compression
	# path (level_22_btultra2) within minutes. Strategy groups
	# mirror `clevels.h` + `StrategyTag::for_level` so an
	# entire strategy's levels share a runner — keeps per-job
	# build overhead amortised across the levels of that family.
	if [ "$EVENT_NAME" = "pull_request" ]; then
	cat > shards.json <<'EOF'
	[
	{
	"id": "pr-canonical",
	"levels": "level_3_dfast,level_22_btultra2"
	}
	]
	EOF
	else
	cat > shards.json <<'EOF'
	[
	{
	"id": "fast-neg",
	"levels": "level_-7_fast,level_-6_fast,level_-5_fast,level_-4_fast,level_-3_fast,level_-2_fast"
	},
	{
	"id": "fast-dfast",
	"levels": "level_-1_fast,level_1_fast,level_2_fast,level_3_dfast,level_4_dfast,level_5_greedy"
	},
	{
	"id": "lazy-lower",
	"levels": "level_6_lazy,level_7_lazy,level_8_lazy,level_9_lazy,level_10_lazy"
	},
	{
	"id": "lazy-upper",
	"levels": "level_11_lazy,level_12_lazy,level_13_lazy,level_14_lazy,level_15_lazy"
	},
	{
	"id": "btopt",
	"levels": "level_16_btopt,level_17_btopt,level_1_fast_ldm,level_22_btultra2_ldm,level_1_fast_ldm_dict,level_22_btultra2_ldm_dict"
	},
	{
	"id": "btultra2",
	"levels": "level_18_btultra,level_19_btultra2,level_20_btultra2,level_21_btultra2,level_22_btultra2"
	}
	]
	EOF
	fi
	shards_compact=$(jq -c . shards.json)
	shards_csv=$(jq -r '[.[].id] \| join(",")' shards.json)
	echo "shards=$shards_compact" >> "$GITHUB_OUTPUT"
	echo "shards_csv=$shards_csv" >> "$GITHUB_OUTPUT"
	echo "Bench shards ($EVENT_NAME): $shards_csv"

	bench-build:
	  # Build the criterion `compare_ffi` binary once per target. Every
	  # downstream bench shard (target × level) downloads the binary
	  # via the `bench-binary-<target>` artifact and runs it directly — no
	  # rebuild per shard. Saves ~4-7 min on each of the 18 shard
	  # runners.
	  name: Build bench binary (${{ matrix.bench.id }})
	  needs: [lint, bench-matrix]
	  timeout-minutes: 20
	  strategy:
	    fail-fast: false
	    matrix:
	      bench: ${{ fromJSON(needs.bench-matrix.outputs.targets) }}
	  runs-on: ubuntu-latest
	  steps:
	    - uses: actions/checkout@v6
	    - uses: dtolnay/rust-toolchain@stable
	    - name: Install benchmark target
	      run: rustup target add ${{ matrix.bench.target_triple }}
	    - name: Install build toolchain dependencies
	      if: matrix.bench.build_setup != ''
	      run: ${{ matrix.bench.build_setup }}
	    - uses: Swatinem/rust-cache@v2
	      with:
	        prefix-key: bench-${{ matrix.bench.id }}
	    - name: Build compare_ffi + compare_ffi_memory bench binaries
	      env:
	        CC_x86_64_unknown_linux_musl: musl-gcc
	        # The donor zstd-sys C library uses runtime feature detection
	        # (is_x86_feature_detected!-equivalent) so it transparently
	        # picks up BMI2/AVX2/etc. on the runner. Pure-Rust hot paths
	        # gate intrinsics on COMPILE-time cfg!(target_feature = ...)
	        # and the default rustc x86_64 target ships with SSE2 only.
	        # Without explicit target selection the bench compares
	        # "donor with full ISA" vs "us with SSE2 baseline" — not
	        # apples-to-apples.
	        #
	        # Use a DETERMINISTIC baseline (x86-64-v3 = BMI2 + AVX2 +
	        # everything in the Haswell ISA, the 2013+ x86_64 baseline)
	        # ONLY for x86_64 targets via target.<triple>.rustflags. NOT
	        # target-cpu=native: that picks whatever CPU the BUILD runner
	        # has, which (a) varies across github-runners, (b) crashes
	        # with SIGILL when a bench shard runner lacks features the
	        # build runner had, and (c) is meaningless for cross-compile
	        # targets like i686-unknown-linux-gnu.
	        #
	        # i686 / non-x86 targets keep the default rustc
	        # baseline. Measured +8.5% on
	        # decompress/level_-1_fast/decodecorpus-z000033/c_stream on
	        # i9-9900K; the win comes from _bzhi_u64-backed
	        # mask_lower_bits in the FSE state-update hot path.
	        CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: "-C target-cpu=x86-64-v3"
	        CARGO_TARGET_X86_64_UNKNOWN_LINUX_MUSL_RUSTFLAGS: "-C target-cpu=x86-64-v3"
	      run: |
	        # `--no-run` builds without executing. The JSON message stream
	        # on stdout carries the resolved binary path of every bench
	        # target, so we can ship just the executables to shards.
	        # `json-render-diagnostics` keeps that artifact stream intact
	        # while letting cargo print compiler errors/warnings to stderr,
	        # so a failed build is readable in the job log instead of being
	        # buried as JSON inside build.log. Building BOTH bench binaries
	        # in one cargo invocation reuses dependency compilation between
	        # them (zstd-sys, criterion, etc.).
	        cargo bench --bench compare_ffi --bench compare_ffi_memory \
	          -p structured-zstd --features dict_builder \
	          --target ${{ matrix.bench.target_triple }} --no-run \
	          --message-format=json-render-diagnostics > build.log
	        mkdir -p bench-binary
	        for name in compare_ffi compare_ffi_memory; do
	          # Last matching artifact record wins if cargo reports the
	          # target more than once.
	          bin_path=$(jq -r --arg n "$name" 'select(.executable != null and (.target.name == $n)) | .executable' build.log | tail -1)
	          if [ -z "$bin_path" ] || [ ! -x "$bin_path" ]; then
	            echo "ERROR: failed to locate $name binary in cargo output" >&2
	            # Dump every executable cargo did report to aid debugging.
	            jq -r 'select(.executable != null) | "\(.target.name) \(.executable)"' build.log >&2
	            exit 1
	          fi
	          cp "$bin_path" "bench-binary/$name"
	          chmod +x "bench-binary/$name"
	          echo "$name size: $(wc -c < "bench-binary/$name") bytes"
	        done
	    - name: Upload bench binary
	      uses: actions/upload-artifact@v7
	      with:
	        name: bench-binary-${{ matrix.bench.id }}
	        path: bench-binary/
	        if-no-files-found: error
	        retention-days: 7

	benchmark:
	  # Runs one pre-built bench binary per (target, shard) matrix cell and
	  # uploads that cell's result files as a `benchmark-shard-*` artifact.
	  name: Bench ${{ matrix.bench.id }} / ${{ matrix.shard.id }}
	  needs: [bench-build, bench-matrix]
	  timeout-minutes: ${{ matrix.bench.timeout_minutes }}
	  strategy:
	    # Matrix split target × level. The pre-built binary from
	    # `bench-build` is what each shard executes, so the runtime
	    # budget per shard is purely the criterion measurement +
	    # post-processing. `level22` on i686 is still the natural
	    # bottleneck (~20 min); every other (target, level) combo
	    # finishes well under 10 min.
	    fail-fast: false
	    matrix:
	      bench: ${{ fromJSON(needs.bench-matrix.outputs.targets) }}
	      # Shard plan is resolved in `bench-matrix.outputs.shards`.
	      # Each shard owns one strategy-grouped level bundle (PR runs
	      # a single `pr-canonical` shard with level_3 + level_22; main
	      # runs the full strategy-grouped plan — see #164 for the
	      # fast/lazy split rationale). `shard.levels` is a CSV that we
	      # forward into `STRUCTURED_ZSTD_BENCH_LEVEL_FILTER` so the
	      # bench binary iterates the requested levels in one process.
	      shard: ${{ fromJSON(needs.bench-matrix.outputs.shards) }}
	  runs-on: ubuntu-latest
	  steps:
	    # Checkout is needed for: corpus files referenced via
	    # `env!("CARGO_MANIFEST_DIR")` inside the bench binary, the
	    # run-benchmarks.sh script, and the python post-processor.
	    - uses: actions/checkout@v6

	    - name: Install target runtime dependencies
	      if: matrix.bench.runtime_setup != ''
	      run: ${{ matrix.bench.runtime_setup }}

	    - name: Download pre-built bench binary
	      uses: actions/download-artifact@v8
	      with:
	        name: bench-binary-${{ matrix.bench.id }}
	        path: bench-binary
	    - name: Mark bench binaries executable
	      # `actions/download-artifact` strips the executable bit
	      # (downloaded files land as mode 0644). Both binaries shipped
	      # in the artifact need +x — the memory binary's `-x` check in
	      # run-benchmarks.sh would otherwise reject it on main pushes.
	      run: |
	        chmod +x bench-binary/compare_ffi
	        if [ -f bench-binary/compare_ffi_memory ]; then
	          chmod +x bench-binary/compare_ffi_memory
	        fi

	    - name: Run benchmarks (filtered to shard's levels)
	      env:
	        STRUCTURED_ZSTD_BENCH_TARGET: ${{ matrix.bench.id }}
	        STRUCTURED_ZSTD_BENCH_TRIPLE: ${{ matrix.bench.target_triple }}
	        # Timestamp source by event: PR update time, else the pushed
	        # head commit's timestamp, else the repository's updated_at.
	        STRUCTURED_ZSTD_BENCH_GENERATED_AT: ${{ github.event_name == 'pull_request' && github.event.pull_request.updated_at || github.event.head_commit.timestamp || github.event.repository.updated_at }}
	        STRUCTURED_ZSTD_BENCH_LEVEL_FILTER: ${{ matrix.shard.levels }}
	        # run-benchmarks.sh: re-exec this binary instead of `cargo bench`.
	        STRUCTURED_ZSTD_BENCH_BIN: ${{ github.workspace }}/bench-binary/compare_ffi
	        # Memory bench runs only on main pushes — its TrackingAllocator
	        # measures peak alloc bytes precisely but adds per-allocation
	        # overhead, so we don't want it on every PR review cycle. On
	        # main pushes (`event_name == 'push'`) the second binary is
	        # invoked sequentially by run-benchmarks.sh and its REPORT_MEM
	        # lines feed the dashboard's `peak_alloc_bytes` metric.
	        STRUCTURED_ZSTD_BENCH_MEMORY_BIN: ${{ github.event_name == 'push' && format('{0}/bench-binary/compare_ffi_memory', github.workspace) || '' }}
	        # The prebuilt bench binary is launched directly (not via cargo),
	        # so `env::var("CARGO_MANIFEST_DIR")` returns None inside it.
	        # Without this override, `load_decode_corpus_scenario()` falls
	        # back to the synthetic 1 MiB corpus and the bench label silently
	        # flips from `decodecorpus-z000033` to `decodecorpus-synthetic-1m`,
	        # making dashboards diverge from a baseline produced via
	        # `cargo bench`. Point the binary at the checkout's real fixture.
	        STRUCTURED_ZSTD_BENCH_CORPUS_PATH: ${{ github.workspace }}/zstd/decodecorpus_files/z000033
	      run: bash .github/scripts/run-benchmarks.sh

	    - name: Rename benchmark outputs for matrix artifact
	      env:
	        # `<target>.<shard>` infix; passed through the environment so the
	        # script body contains no inline expression expansion.
	        SHARD_SUFFIX: ${{ matrix.bench.id }}.${{ matrix.shard.id }}
	      run: |
	        mv benchmark-results.json "benchmark-results.${SHARD_SUFFIX}.json"
	        mv benchmark-report.md "benchmark-report.${SHARD_SUFFIX}.md"
	        mv benchmark-delta.json "benchmark-delta.${SHARD_SUFFIX}.json"
	        mv benchmark-delta.md "benchmark-delta.${SHARD_SUFFIX}.md"
	        mv benchmark-relative.json "benchmark-relative.${SHARD_SUFFIX}.json"

	    - name: Upload benchmark shard artifacts
	      uses: actions/upload-artifact@v7
	      with:
	        name: benchmark-shard-${{ matrix.bench.id }}-${{ matrix.shard.id }}
	        path: |
	          benchmark-results.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
	          benchmark-report.${{ matrix.bench.id }}.${{ matrix.shard.id }}.md
	          benchmark-delta.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
	          benchmark-delta.${{ matrix.bench.id }}.${{ matrix.shard.id }}.md
	          benchmark-relative.${{ matrix.bench.id }}.${{ matrix.shard.id }}.json
	        if-no-files-found: error
	        # Intermediate inputs to `benchmark-aggregate`; match the
	        # 7-day retention used for `bench-binary-*`.
	        retention-days: 7

	benchmark-aggregate:
	  # Collects every `benchmark-shard-*` artifact, folds the level shards
	  # into per-target files, re-uploads them as one `benchmark-aggregated`
	  # artifact, and (main push only) records the new dashboard baseline.
	  name: Aggregate benchmark shards per target
	  needs: [benchmark, bench-matrix]
	  timeout-minutes: 10
	  runs-on: ubuntu-latest
	  steps:
	    - uses: actions/checkout@v6

	    - name: Generate bot token
	      id: bot-token
	      # Minted only for pushes to main: the baseline-save step at the
	      # end of this job is the token's sole consumer and carries the
	      # same gate, so other events never need app credentials.
	      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
	      uses: actions/create-github-app-token@v3
	      with:
	        app-id: ${{ secrets.RELEASER_APP_ID }}
	        private-key: ${{ secrets.RELEASER_APP_PRIVATE_KEY }}

	    - name: Download benchmark shard artifacts
	      uses: actions/download-artifact@v8
	      with:
	        pattern: benchmark-shard-*
	        path: benchmark-artifacts

	    - name: Aggregate level shards into per-target files
	      env:
	        AGGREGATE_TARGETS: ${{ needs.bench-matrix.outputs.ids_csv }}
	      run: python3 .github/scripts/aggregate-bench-levels.py

	    - name: Upload aggregated benchmark artifact
	      # Single combined artifact carrying per-target consolidated
	      # files. `merge-benchmarks.py` rglob's the download root so
	      # it picks them up regardless of subdir layout; this lets
	      # bench-pages download one artifact instead of one per target.
	      uses: actions/upload-artifact@v7
	      with:
	        name: benchmark-aggregated
	        path: |
	          benchmark-results.*.json
	          benchmark-report.*.md
	          benchmark-delta.*.json
	          benchmark-delta.*.md
	          benchmark-relative.*.json
	        if-no-files-found: error
	        retention-days: 7

	    # Save baseline (main push only). Intentionally NO `fail-on-alert`
	    # and NO `comment-on-alert` — this step only records the new baseline
	    # for the dashboard. The pre-merge PR regression gate was removed in
	    # #362 (it forced the full bench matrix on every PR for little signal);
	    # perf is now surfaced post-merge via the `dev/bench` dashboard. Adding
	    # an alert/fail path here would re-create the stuck-baseline cascade
	    # from #158: a regression on main push would fail the step before
	    # `save-data-file` ran, freezing the baseline indefinitely.
	    - name: Save benchmark baseline (main push only)
	      if: steps.bot-token.outputs.token != '' && github.event_name == 'push' && github.ref == 'refs/heads/main'
	      uses: benchmark-action/github-action-benchmark@v1
	      with:
	        name: "structured-zstd vs C FFI (x86_64-gnu)"
	        tool: customSmallerIsBetter
	        output-file-path: benchmark-results.x86_64-gnu.json
	        github-token: ${{ steps.bot-token.outputs.token }}
	        auto-push: true
	        save-data-file: true
	        comment-on-alert: false
	        fail-on-alert: false
	        alert-threshold: "130%"
	        benchmark-data-dir-path: dev/bench

	benchmark-pages:
	  # Merges the aggregated per-target payloads and commits the resulting
	  # reports plus the dashboard `index.html` to `gh-pages` under dev/bench.
	  name: Publish benchmark pages payloads
	  needs: benchmark-aggregate
	  # Push-to-main only — the whole bench pipeline is gated there now (#362);
	  # the PR branch of this filter was dead once `bench-matrix` stopped
	  # running on PRs (cascade-skip), and PR runs never published anyway.
	  if: github.event_name == 'push' && github.ref == 'refs/heads/main'
	  timeout-minutes: 20
	  runs-on: ubuntu-latest
	  steps:
	    - uses: actions/checkout@v6

	    - name: Generate bot token
	      id: bot-token
	      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
	      uses: actions/create-github-app-token@v3
	      with:
	        app-id: ${{ secrets.RELEASER_APP_ID }}
	        private-key: ${{ secrets.RELEASER_APP_PRIVATE_KEY }}

	    # Second checkout: the gh-pages branch lands in ./gh-pages and is
	    # authenticated with the app token so the final `git push` works.
	    - name: Checkout gh-pages with push token
	      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
	      uses: actions/checkout@v6
	      with:
	        ref: gh-pages
	        token: ${{ steps.bot-token.outputs.token }}
	        path: gh-pages

	    # Only the aggregated per-target files feed merge-benchmarks.py.
	    # The level shards (benchmark-shard-*) are intermediate inputs
	    # to `benchmark-aggregate` and would otherwise pollute
	    # merge-benchmarks.py's per-target name extraction with
	    # `<target>.<level>` keys, so we download just the single
	    # combined `benchmark-aggregated` artifact here.
	    - uses: actions/download-artifact@v8
	      with:
	        name: benchmark-aggregated
	        path: benchmark-artifacts

	    - name: Merge relative/delta payloads
	      run: python3 .github/scripts/merge-benchmarks.py

	    - name: Publish benchmark reports to gh-pages
	      if: github.event_name == 'push' && github.ref == 'refs/heads/main'
	      run: |
	        mkdir -p gh-pages/dev/bench
	        cp merged/benchmark-report.md gh-pages/dev/bench/benchmark-report.md
	        cp merged/benchmark-delta.json gh-pages/dev/bench/benchmark-delta.json
	        cp merged/benchmark-delta.md gh-pages/dev/bench/benchmark-delta.md
	        cp merged/benchmark-summary.md gh-pages/dev/bench/benchmark-summary.md
	        cp merged/benchmark-delta-summary.md gh-pages/dev/bench/benchmark-delta-summary.md
	        cp merged/benchmark-relative.json gh-pages/dev/bench/benchmark-relative.json
	        cp .github/bench-dashboard/index.html gh-pages/dev/bench/index.html
	        cd gh-pages
	        git config user.name "github-actions[bot]"
	        git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
	        git add dev/bench/index.html dev/bench/benchmark-report.md dev/bench/benchmark-delta.json dev/bench/benchmark-delta.md dev/bench/benchmark-relative.json dev/bench/benchmark-summary.md dev/bench/benchmark-delta-summary.md
	        # Commit only when the staged tree actually changed; an
	        # unchanged payload must not fail the step.
	        git diff --cached --quiet || git commit -m "chore(bench): publish benchmark reports"
	        git push origin gh-pages

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

perf(hc): lazy-skip + separate dictMatchState beat C on small-window dict compress #2051

Workflow file

perf(hc): lazy-skip + separate dictMatchState beat C on small-window dict compress #2051

Uh oh!

Workflow file for this run