-
Notifications
You must be signed in to change notification settings - Fork 4
468 lines (439 loc) · 20.3 KB
/
Copy pathci.yml
File metadata and controls
468 lines (439 loc) · 20.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
name: CI

# ExpressGateway CI: every per-PR/push BLOCKING gate lives in this one workflow.
#
# S40 consolidation: the former `ci.yml` (fast checks + build/test) and
# `prod-readiness-gates.yml` (the D-gates that need a real CI environment) were
# two separate blocking workflows that both fired on every PR/push to main. They
# are merged here into one workflow, grouped into sections. NO gate was dropped;
# audit/release/s40-gate-map.md holds the before->after mapping (every old job
# -> its new home) and the required-status-check rename list.
#
# Shared setup (toolchain + cache + system-deps) is single-sourced via the
# ./.github/actions/rust-setup composite (R12). Weekly informational scans live
# in scheduled.yml; tag-triggered build/publish + the soak release gate live in
# release.yml.

# Runs on pushes to main, on every pull request, and on manual dispatch.
on:
  push:
    branches: [main]
  pull_request:
  workflow_dispatch:

# Least privilege: no step visible in this file uses the GITHUB_TOKEN for
# anything beyond checking out the repository, so the token is restricted to
# read-only contents. A job that later needs more should widen it with its own
# job-level `permissions:` block rather than loosening this default.
# NOTE(review): confirm the local composites/scripts need no extra scopes.
permissions:
  contents: read

env:
  CARGO_TERM_COLOR: always
  # Every compiler warning is a hard error in all jobs.
  RUSTFLAGS: -D warnings
  # MSRV: matches rust-toolchain.toml (moved 1.85 -> 1.88 at S31 for quiche
  # 0.29.1 + tokio-quiche 0.19, which hard-require Rust 1.88). Quoted so YAML
  # keeps it a string instead of a float.
  RUST_MSRV: "1.88"

# One live run per workflow + ref; a newer run on the same ref cancels the
# older one that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
# =================================================================
# SECTION 1 — fast checks (format, compile, lint, doc + panic guards)
# =================================================================
fmt:
  # Formatting gate: fails when rustfmt would rewrite any file in the workspace.
  runs-on: ubuntu-latest
  name: Format
  steps:
    - uses: actions/checkout@v6
    # Only the rustfmt component is requested; the composite's `cache` input is
    # switched off (presumably because nothing is compiled here - confirm in
    # the rust-setup action).
    - uses: ./.github/actions/rust-setup
      with:
        cache: "false"
        components: rustfmt
    - run: cargo fmt --all -- --check
check:
  # Type-checks every target of every workspace crate with all features on.
  runs-on: ubuntu-latest
  name: Check
  steps:
    - uses: actions/checkout@v6
    - uses: ./.github/actions/rust-setup
    # Do not drop --all-features. The R8 memory integration tests in tests/
    # read lb-l7's `#[cfg(feature = "test-gauges")]` gauges, which the root
    # crate forwards through its own `test-gauges` feature (off by default).
    # The canonical session gate has always been built this way.
    - run: cargo check --workspace --all-targets --all-features
clippy:
  # Lint gate: any clippy warning across the workspace (all targets, all
  # features) fails the job via `-D warnings`.
  runs-on: ubuntu-latest
  name: Clippy
  steps:
    - uses: actions/checkout@v6
    - uses: ./.github/actions/rust-setup
      with:
        components: clippy
    - run: cargo clippy --workspace --all-targets --all-features -- -D warnings
panic-freedom:
  # Static guard: every library crate root (crates/*/src/lib.rs) must carry a
  # `#![deny(...)]` attribute that lists `clippy::unwrap_used`. This is a text
  # search only, so no Rust toolchain is installed.
  name: Panic Freedom Audit
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v6
    - name: Verify panic-freedom deny lints are present in every library crate
      run: |
        libs=(crates/*/src/lib.rs)
        # An unmatched glob stays literal in bash; report that case explicitly
        # instead of flagging the pattern itself as a "crate".
        if [ ! -e "${libs[0]}" ]; then
          echo "::error::No library crates matched crates/*/src/lib.rs"
          exit 1
        fi
        MISSING=""
        for lib in "${libs[@]}"; do
          # Match the deny attribute even when it spans multiple lines
          # (e.g. `#![deny(\n clippy::unwrap_used, ...\n)]`): -z makes grep
          # treat the whole file as one record so [^)]* can cross newlines.
          if ! grep -Pzoq '#!\[deny\([^)]*clippy::unwrap_used' "$lib" 2>/dev/null; then
            # %0A is the workflow-command escape for a newline. A literal
            # "\n" is not expanded by plain `echo` and would show up verbatim
            # in the annotation.
            MISSING="${MISSING}%0A  ${lib}"
          fi
        done
        if [ -n "$MISSING" ]; then
          echo "::error::Crates missing panic-freedom deny lints:$MISSING"
          exit 1
        fi
        echo "All library crates have panic-freedom deny lints."
doc-lint:
  # Two-tier documentation gate, both implemented by scripts/ci/doc-lint.sh:
  #   tier 1 - operator-facing docs must not contain known-stale patterns;
  #   tier 2 - every `Status: Verified-Fixed(<sha>)` audit claim must resolve
  #            to a SHA whose diff actually closes the recommendation
  #            (audit-of-audit).
  runs-on: ubuntu-latest
  name: Doc Lint
  steps:
    - uses: actions/checkout@v6
      with:
        # Full history is required: tier 2 runs `git show` / `git ls-tree`
        # against the historical SHAs cited in Verified-Fixed claims.
        fetch-depth: 0
    - run: bash scripts/ci/doc-lint.sh
# =================================================================
# SECTION 2 — build & test (suite, MSRV, fuzz smoke, release codegen)
# =================================================================
test:
  # The pass/fail gate for the whole workspace test suite.
  runs-on: ubuntu-latest
  name: Test
  steps:
    - uses: actions/checkout@v6
    - uses: ./.github/actions/rust-setup
    # Free runner disk before building. The --all-features workspace test build
    # links many large integration-test binaries and can hit ENOSPC on a
    # default hosted runner (especially with a warm rust-cache restored). This
    # reclaims ~15-20 GB, using the same step as the coverage job. It changes
    # no test assertion.
    - uses: ./.github/actions/free-disk
    # Same shape as the canonical session gate: --all-features (= test-gauges)
    # plus --no-fail-fast. The heavy real-wire e2e binaries (grpc_h3_e2e, ws_*)
    # serialize themselves through an in-file `static SUITE_SERIAL` tokio Mutex.
    #
    # CF-FCAP1-FLAKE / CF-SATURATION-1: fcap1_h2_over_cap_upload_yields_413 has
    # to push a 66 MiB body past the 64 MiB cap to assert 413. Sharing the
    # hosted runner's CPU with its ~14 sibling tests starves that upload, so it
    # is skipped here and run ALONE in the next step. That is serialization,
    # not weakening: same test, same assert_eq!(status, 413).
    - name: cargo test (suite minus the saturation-isolated fcap1)
      timeout-minutes: 45
      run: cargo test --workspace --all-features --no-fail-fast -- --skip fcap1_h2_over_cap_upload_yields_413
    # fcap1 in isolation, up to three attempts. Achievable upload rate on a
    # hosted runner varies (~0.3-1.1 MiB/s), so even an isolated attempt can be
    # starved by a noisy neighbour; the gateway deadlines are raised to 300 s
    # (see the test). A genuine cap regression fails ALL three attempts because
    # the assertion is unchanged; only a slow-transfer env-flake benefits from
    # the retry. CF-SATURATION-1.
    - name: cargo test (fcap1 over-cap, isolated / serial, retry on env-flake)
      timeout-minutes: 25
      run: |
        for attempt in 1 2 3; do
          echo "::group::fcap1 attempt $attempt"
          if cargo test -p lb-integration-tests --test h2h1_md_streaming_verify \
            --all-features -- --exact fcap1_h2_over_cap_upload_yields_413 --nocapture; then
            echo "::endgroup::"; echo "fcap1 passed on attempt $attempt"; exit 0
          fi
          echo "::endgroup::"; echo "fcap1 attempt $attempt failed (env throughput?); retrying"
        done
        echo "::error::fcap1 failed all 3 attempts — this is a real cap-enforcement failure, not an env flake"
        exit 1
msrv:
  # The canonical "compiles on 1.88" gate: `cargo check --all-targets
  # --all-features` on the pinned MSRV toolchain. Full codegen is covered by
  # the release-build job, not here.
  runs-on: ubuntu-latest
  name: MSRV (1.88)
  steps:
    - uses: actions/checkout@v6
    # Toolchain comes from the workflow-level RUST_MSRV variable.
    - uses: ./.github/actions/rust-setup
      with:
        toolchain: ${{ env.RUST_MSRV }}
    - run: cargo check --workspace --all-targets --all-features
fuzz-smoke:
  # Runs every cargo-fuzz target for ten seconds on nightly; a crash in any
  # target fails the job.
  runs-on: ubuntu-latest
  name: Fuzz Smoke Test
  steps:
    - uses: actions/checkout@v6
    - uses: ./.github/actions/rust-setup
      with:
        toolchain: nightly
    # cargo-fuzz is built with an explicit +nightly, NOT the repo's pinned
    # 1.88. Without the explicit +toolchain, rust-toolchain.toml (1.88) wins
    # over the default and a recent cargo-platform (MSRV 1.91, a cargo-fuzz
    # dependency) fails to build. nightly is already installed by this job and
    # is what runs the targets below, so it is present and new enough.
    - name: Install cargo-fuzz
      run: cargo +nightly install cargo-fuzz --locked
    - name: Smoke test all fuzz targets (10 seconds each)
      run: |
        cd fuzz
        targets=$(cargo +nightly fuzz list)
        if [ -z "$targets" ]; then
          echo "::error::No fuzz targets discovered in fuzz/Cargo.toml"
          exit 1
        fi
        for target in $targets; do
          echo "::group::Fuzzing $target"
          cargo +nightly fuzz run "$target" -- -max_total_time=10 2>&1
          echo "::endgroup::"
        done
release-build:
  # Full release-profile codegen of the workspace. Starts only after the fast
  # checks and the test suite have all succeeded.
  runs-on: ubuntu-latest
  name: Release Build
  needs:
    - check
    - clippy
    - test
    - fmt
    - panic-freedom
    - doc-lint
  steps:
    - uses: actions/checkout@v6
    - uses: ./.github/actions/rust-setup
    - timeout-minutes: 30
      run: cargo build --workspace --release
# =================================================================
# SECTION 3 — security & dependency gates
# =================================================================
audit:
  # RUSTSEC advisory gate over the dependency tree.
  runs-on: ubuntu-latest
  name: Security Audit
  steps:
    - uses: actions/checkout@v6
    # rust-toolchain.toml pins 1.88, but cargo-audit needs a recent rustc to
    # build, so the install below uses `+stable` to always compile it on the
    # latest stable toolchain.
    - uses: ./.github/actions/rust-setup
    - name: Install cargo-audit
      run: cargo +stable install cargo-audit --locked
    # SEC-2-07: ANY RUSTSEC advisory fails the job. An ignore is only allowed
    # as an explicit entry in .cargo/audit.toml with a justification + link.
    # scheduled.yml runs this SAME strict audit weekly so advisories published
    # against unchanged deps between PRs are still caught.
    - run: cargo audit -D warnings
cargo-deny:
  # advisories / licenses / bans / sources. cargo-deny is a standalone prebuilt
  # binary (no toolchain needed). The explicit subcommand list keeps a future
  # cargo-deny "check" default from silently shrinking the gate. This is the
  # SINGLE cargo-deny gate (the duplicate was removed at S34).
  name: cargo-deny (licenses/advisories/bans/sources)
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v6
    - name: Install cargo-deny (prebuilt binary)
      run: |
        # pipefail: a failed download must fail the step at the curl, not be
        # judged only by tar's exit status at the end of the pipe.
        set -euo pipefail
        VER=0.19.6
        # Fetch from the canonical GitHub release host. Only the `cargo-deny`
        # member of the archive is extracted, with its top-level directory
        # stripped, straight into /usr/local/bin.
        curl -fsSL "https://github.com/EmbarkStudios/cargo-deny/releases/download/${VER}/cargo-deny-${VER}-x86_64-unknown-linux-musl.tar.gz" \
          | tar -xz --strip-components=1 -C /usr/local/bin --wildcards '*/cargo-deny'
        cargo-deny --version
    - run: cargo-deny check licenses advisories bans sources
# =================================================================
# SECTION 4 — coverage & protocol conformance
# =================================================================
coverage:
  # Hot-path coverage, PER-MODULE >= 80% (audit/coverage-scope.md charter
  # metric, NOT a whole-package aggregate). Runs the FULL workspace suite under
  # instrumentation so the integration tests exercise the hot paths, then
  # enforces >= 80% per hot-path module via scripts/ci/coverage-check.sh
  # (lb-l4-xdp/src/loader.rs carved out by name: XDP load needs root and is
  # smoke-validated by the xdp-smoke job). A new hot-path module under 80%
  # turns this job RED.
  name: Coverage (per-module hot-path >= 80%)
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v6
    - uses: ./.github/actions/rust-setup
      with:
        toolchain: ${{ env.RUST_MSRV }}
        components: llvm-tools-preview
        system-deps: 'true'
    - uses: ./.github/actions/free-disk  # instrumented --workspace build is ~28 GB
    - name: Install cargo-llvm-cov + nextest
      run: |
        # pipefail: a failed download must fail the step at the curl, not be
        # judged only by tar's exit status at the end of the pipe.
        set -euo pipefail
        # Prebuilt binaries, unpacked into ~/.cargo/bin. cargo-llvm-cov comes
        # from the canonical GitHub release host.
        curl -fsSL https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar -xz -C ~/.cargo/bin
        curl -fsSL https://get.nexte.st/latest/linux | tar -xz -C ~/.cargo/bin
    - name: Coverage (full workspace suite, --all-features)
      # --ignore-run-fail: this is the COVERAGE gate, not the pass/fail gate
      # (that is the `test` job). Measure what the suite covers even if a test
      # flakes; coverage-check.sh below is the actual verdict.
      run: |
        cargo llvm-cov nextest \
          --workspace --all-features --ignore-run-fail \
          --lcov --output-path coverage.lcov
    - name: Enforce per-module hot-path threshold (charter metric)
      run: bash scripts/ci/coverage-check.sh coverage.lcov
    # Upload the raw report even when the threshold step failed, so the
    # shortfall can be inspected.
    - uses: actions/upload-artifact@v7
      if: always()
      with:
        name: coverage-lcov
        path: coverage.lcov
conformance:
  # HTTP/2 + HTTP/3 conformance against the REAL gateway: an h1s listener
  # (TLS, ALPN h2/http1.1) on TCP :8443 -> h2spec; a quic listener
  # (H3-terminate, quiche::h3) on UDP :8444 -> h3spec. Tool versions are PINNED
  # so the h3spec waiver list (CF-QUICHE-UPGRADE) stays exact.
  name: Conformance (h2spec --strict + h3spec)
  runs-on: ubuntu-latest
  env:
    H2SPEC_VER: "v2.6.0"
    H3SPEC_VER: "v0.1.13"
  steps:
    - uses: actions/checkout@v6
    - uses: ./.github/actions/rust-setup
      with:
        toolchain: ${{ env.RUST_MSRV }}
        system-deps: 'true'
    - run: cargo build -p lb
    - name: Stub backend + test cert/key + QUIC retry secret
      run: |
        # Backend on :3000 is a plain Python file server, started in the
        # background; its PID is saved for the cleanup step.
        python3 -m http.server 3000 >/tmp/backend.log 2>&1 &
        echo $! > backend.pid
        # Self-signed cert (DNS + IP SANs); h2spec uses -k and h3spec uses -n
        # so cert validation is skipped (protocol conformance, not PKI), but
        # the cert must still load into rustls + BoringSSL.
        openssl req -x509 -newkey rsa:2048 -keyout key.pem -out cert.pem \
          -days 1 -nodes -subj "/CN=localhost" \
          -addext "subjectAltName=DNS:localhost,IP:127.0.0.1"
        head -c 32 /dev/urandom > retry.bin
    - name: Write conformance config (lb-config schema)
      # The heredoc delimiter is deliberately unquoted so $PWD expands to the
      # absolute workspace path inside the generated TOML.
      run: |
        cat > conformance.toml <<TOML
        [runtime]
        drain_timeout_ms = 5000
        readiness_settle_ms = 100
        # H1+H2 over TLS (ALPN h2 then http/1.1) -> h2spec target (TCP).
        [[listeners]]
        address = "127.0.0.1:8443"
        protocol = "h1s"
        [listeners.tls]
        cert_path = "$PWD/cert.pem"
        key_path = "$PWD/key.pem"
        [[listeners.backends]]
        address = "127.0.0.1:3000"
        protocol = "h1"
        weight = 1
        # HTTP/3 over QUIC, H3-terminate (quiche::h3) -> h3spec target (UDP).
        [[listeners]]
        address = "127.0.0.1:8444"
        protocol = "quic"
        [listeners.quic]
        cert_path = "$PWD/cert.pem"
        key_path = "$PWD/key.pem"
        retry_secret_path = "$PWD/retry.bin"
        [[listeners.backends]]
        address = "127.0.0.1:3000"
        protocol = "h1"
        weight = 1
        [observability]
        metrics_bind = "127.0.0.1:9090"
        TOML
        cat conformance.toml
    - name: Boot gateway + wait for BOTH listeners
      run: |
        ./target/debug/expressgateway conformance.toml >/tmp/gw.log 2>&1 &
        echo $! > gw.pid
        # Poll once a second, up to 40 times, until the TCP :8443 and the
        # UDP :8444 sockets are both bound.
        up=0
        for i in $(seq 1 40); do
          if ss -ltn | grep -q ':8443' && ss -lun | grep -q ':8444'; then up=1; break; fi
          sleep 1
        done
        if [ "$up" -ne 1 ]; then
          echo "::error::gateway listeners did not come up"; cat /tmp/gw.log; exit 1
        fi
        echo "both listeners up; gateway log:"; tail -5 /tmp/gw.log
    - name: h2spec --strict (HTTP/2 conformance, TCP :8443)
      run: |
        # pipefail: a failed download must fail the step at the curl, not be
        # judged only by tar's exit status at the end of the pipe.
        set -euo pipefail
        # Pinned h2spec release, fetched from the canonical GitHub host.
        curl -fsSL "https://github.com/summerwind/h2spec/releases/download/${H2SPEC_VER}/h2spec_linux_amd64.tar.gz" | tar -xz
        # -k: skip cert validation (protocol conformance, not PKI). --strict:
        # fail on ANY non-conformant case. The gateway passes 147/147.
        ./h2spec -t -k -h 127.0.0.1 -p 8443 --strict
    - name: h3spec (HTTP/3 conformance, UDP :8444) via named-waiver gate
      run: |
        # Pinned h3spec release, fetched from the canonical GitHub host.
        curl -fsSL -o h3spec "https://github.com/kazu-yamamoto/h3spec/releases/download/${H3SPEC_VER}/h3spec-linux-x86_64"
        chmod +x h3spec
        # The wrapper runs h3spec -n and PASSES iff every failure is one of the
        # 12 individually-named, documented quiche-0.29 limitations
        # (CF-QUICHE-UPGRADE). A NEW/un-waived failure turns this RED.
        bash scripts/ci/h3spec-check.sh ./h3spec 127.0.0.1 8444
    - name: Stop gateway + backend
      # Always runs; `|| true` keeps cleanup from failing the job when a
      # process has already exited or its pid file was never written.
      if: always()
      run: |
        kill "$(cat gw.pid)" 2>/dev/null || true
        kill "$(cat backend.pid)" 2>/dev/null || true
chaos-attacks:
  # The ATTACK half of the chaos suite, i.e. the part that does not need a soak
  # host: Rapid Reset, CONTINUATION flood, HPACK bomb, slowloris. The 4-hour
  # soak (D-3b) belongs to the release-soak gate (scripts/release-soak.sh) and
  # is not a PR gate.
  runs-on: ubuntu-latest
  name: Chaos Attack Suite
  steps:
    - uses: actions/checkout@v6
    - uses: ./.github/actions/rust-setup
      with:
        system-deps: "true"
        toolchain: ${{ env.RUST_MSRV }}
    - name: Install cargo-nextest
      run: curl -fsSL https://get.nexte.st/latest/linux | tar -xz -C ~/.cargo/bin
    - run: cargo build -p lb
    # Round-8 wired the Smuggle/Slowloris/SlowPost detectors plus the
    # CONTINUATION/Rapid-Reset caps. --all-features keeps the lb-l7/lb-h2 test
    # surface identical to the session gate. The -E filter selects tests whose
    # name matches any of the listed attack keywords.
    - name: Run chaos attack suite
      run: |
        cargo nextest run --all-features --package lb-h2 --package lb-l7 \
          -E 'test(/chaos|rapid_reset|continuation|hpack|slowloris/)'
# =================================================================
# SECTION 5 — container image & XDP datapath
# =================================================================
image-scan:
  # Real docker build + RUN+SERVE smoke + trivy scan. Builds the image ONCE
  # and (a) proves it BOOTS and SERVES L7 traffic via docker-smoke.sh (a real
  # HTTP/1.1 request through the live container must return the backend's
  # 200 + body) and only then (b) Trivy-scans the same image. `docker build`
  # exit-0 alone was never proof the container works (a pre-S35 image had a
  # wrong CMD that could not boot; that is caught here now).
  name: Container Image (build + serve smoke + trivy)
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v6
    - name: Build image
      # Build args stamp the image with the first 8 characters of the commit
      # SHA and a UTC build timestamp.
      run: |
        docker build -f docker/Dockerfile \
          --build-arg GIT_SHA="${GITHUB_SHA::8}" \
          --build-arg BUILD_DATE="$(date -u +%Y-%m-%dT%H:%M:%SZ)" \
          --build-arg VERSION="ci-${GITHUB_SHA::8}" \
          -t expressgateway:ci .
    - name: Smoke — RUN the container and serve a real request through it
      run: IMAGE=expressgateway:ci bash scripts/ci/docker-smoke.sh
    - name: Trivy image scan
      # Pinned to a release instead of the mutable `master` branch, so an
      # upstream push cannot silently change the code that runs in this
      # blocking gate. For the strongest guarantee, replace the tag with the
      # full commit SHA of the audited release.
      uses: aquasecurity/trivy-action@0.35.0
      with:
        image-ref: expressgateway:ci
        format: table
        exit-code: "1"  # fail the job on findings
        severity: HIGH,CRITICAL
        ignore-unfixed: true
xdp-smoke:
  # XDP verifier-accept smoke (RUNNER-KERNEL ONLY). The committed XDP ELF
  # (crates/lb-l4-xdp/src/lb_xdp.bin) is loaded into the verifier of the
  # runner's OWN kernel (GitHub hosted = Linux 6.x) and must be ACCEPTED. The
  # claim this supports is narrow but true: "the shipped XDP object builds and
  # the in-kernel verifier on a current kernel accepts it." The full
  # 5.15/6.1/6.6 matrix is NOT covered; that needs nested virt, which hosted
  # runners lack (F-ESC-1, self-hosted).
  runs-on: ubuntu-latest
  name: XDP Verifier Smoke (runner kernel)
  steps:
    - uses: actions/checkout@v6
    - uses: ./.github/actions/rust-setup
      with:
        system-deps: "true"
        toolchain: ${{ env.RUST_MSRV }}
    - name: Print runner kernel (for the record — smoke is THIS kernel only)
      run: uname -r
    - name: Build the real-load verifier test (validates the embedded ELF too)
      run: cargo test -p lb-l4-xdp --test round8_verifier_baseline_70 --no-run
    # Why the gateway's own loader is used: the committed ELF (built by
    # scripts/build-xdp.sh) uses aya LEGACY map definitions, which libbpf v1.0+
    # tools (bpftool, ip) REFUSE to load, so aya is the only loader that can
    # load it. round8_verifier_baseline_70 performs a genuine BPF_PROG_LOAD via
    # XdpLoader::load_from_bytes_pinned + kernel_load on the RUNNING kernel (no
    # NIC/attach) and asserts real kernel facts (prog_id, tag,
    # verified_insns > 0). That needs CAP_BPF + bpffs, hence sudo + --ignored;
    # PATH is passed through explicitly so cargo is still found under sudo.
    - name: Load the committed XDP ELF and assert verifier ACCEPT
      run: |
        set -euo pipefail
        sudo -E env "PATH=$PATH" cargo test -p lb-l4-xdp \
          --test round8_verifier_baseline_70 -- --ignored --nocapture