Skip to content

Commit 8058d53

Browse files
committed
fix(verify): guard process-group cleanup
1 parent 53488bb commit 8058d53

4 files changed

Lines changed: 140 additions & 16 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Release entries are curated summaries for readers. Work item traceability remain
2626

2727
### Fixed
2828

29-
- Fixed guard execution for noisy commands and detached child processes by collecting output safely and reporting timeout state more clearly.
29+
- Fixed guard execution for noisy commands, detached child processes, and CI-safe process-group cleanup by collecting output safely, only signaling isolated guard child process groups, and reporting timeout state more clearly.
3030
- Fixed `self-update` archive extraction so release archives and cargo-binstall resolve binaries under `govctl-v{{ version }}-{{ target }}/{{ bin }}`.
3131
- Rendered acceptance-criterion category labels in `govctl work show`, giving reviewers the same category signal as raw TOML.
3232

gov/releases.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ refs = [
1818
"WI-2026-05-25-005",
1919
"WI-2026-05-25-006",
2020
"WI-2026-05-25-007",
21+
"WI-2026-05-25-008",
2122
]
2223

2324
[[releases]]
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#:schema ../schema/work.schema.json
2+
3+
[govctl]
4+
id = "WI-2026-05-25-008"
5+
title = "Fix guard process-group cleanup killing CI"
6+
status = "done"
7+
created = "2026-05-25"
8+
started = "2026-05-25"
9+
completed = "2026-05-25"
10+
11+
[content]
12+
description = "Fix repeated GitHub Actions SIGTERM failures during verification tests after the guard process-group cleanup change. The guard runner must only signal process groups it has proven are isolated guard child groups, so timeout cleanup cannot terminate the CI runner or cargo test process."
13+
14+
[[content.journal]]
15+
date = "2026-05-25"
16+
scope = "ci"
17+
content = """
18+
Investigated repeated GitHub Actions failures on commit 53488bb0. CI and Pre-commit both reached `cargo test --all-features` and were terminated with SIGTERM/exit 143 during `tests/test_verify.rs`, with no Rust assertion or compilation failure. This points to the recent guard process-group cleanup path rather than changelog or docs changes.
19+
20+
Updated guard cleanup to capture the actual child process group after spawn, reject the current runner/cargo process group, require the group to match the isolated child PID, and signal it via direct Unix `kill` only after those checks. Added Unix regression coverage for rejecting the current process group and capturing an isolated child group."""
21+
22+
[[content.journal]]
23+
date = "2026-05-25"
24+
scope = "validation"
25+
content = "Verified the process-group cleanup fix with cargo fmt --all -- --check, cargo test --test test_verify -- --test-threads=1, cargo test verification::unix_process_group_tests --all-features, cargo clippy --all-features -- -D warnings, full cargo test --all-features, and cargo run --quiet -- check."
26+
27+
[[content.acceptance_criteria]]
28+
text = "cargo fmt, relevant tests, full tests, clippy, and govctl check pass"
29+
status = "done"
30+
category = "chore"
31+
32+
[[content.acceptance_criteria]]
33+
text = "Regression coverage checks guard process-group capture rejects the current process group"
34+
status = "done"
35+
category = "chore"
36+
37+
[[content.acceptance_criteria]]
38+
text = "Guard process cleanup only signals an isolated guard child process group, never the current runner process group"
39+
status = "done"
40+
category = "fixed"

src/verification.rs

Lines changed: 98 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ pub fn run_guard(config: &Config, guard: &GuardEntry) -> Result<GuardRunResult,
229229
)
230230
})?;
231231
let child_id = child.id();
232+
let process_group = guard_process_group(child_id);
232233

233234
let deadline = Duration::from_secs(timeout);
234235
let started = Instant::now();
@@ -240,7 +241,7 @@ pub fn run_guard(config: &Config, guard: &GuardEntry) -> Result<GuardRunResult,
240241
match child.try_wait() {
241242
Ok(Some(exit_status)) => {
242243
status = exit_status;
243-
terminate_guard_process_group(child_id);
244+
terminate_guard_process_group(process_group);
244245
break;
245246
}
246247
Ok(None) if started.elapsed() < deadline => {
@@ -249,7 +250,7 @@ pub fn run_guard(config: &Config, guard: &GuardEntry) -> Result<GuardRunResult,
249250
Ok(None) => {
250251
timed_out = true;
251252
primary_shell_running_at_timeout = Some(true);
252-
terminate_guard_process(&mut child);
253+
terminate_guard_process(&mut child, process_group);
253254
status = child.wait().map_err(|err| {
254255
Diagnostic::new(
255256
DiagnosticCode::E1004GuardCheckFailed,
@@ -386,30 +387,86 @@ fn configure_guard_process_group(command: &mut Command) {
386387
#[cfg(not(unix))]
387388
fn configure_guard_process_group(_command: &mut Command) {}
388389

389-
fn terminate_guard_process(child: &mut Child) {
390-
terminate_guard_process_group(child.id());
390+
#[cfg(unix)]
391+
#[derive(Clone, Copy)]
392+
struct GuardProcessGroup {
393+
pgid: i32,
394+
}
395+
396+
#[cfg(not(unix))]
397+
#[derive(Clone, Copy)]
398+
struct GuardProcessGroup;
399+
400+
#[cfg(unix)]
401+
fn guard_process_group(child_id: u32) -> Option<GuardProcessGroup> {
402+
let child_pid = i32::try_from(child_id).ok()?;
403+
let pgid = unix_getpgid(child_pid)?;
404+
let current_pgid = unix_getpgrp()?;
405+
406+
if pgid == current_pgid || pgid != child_pid {
407+
return None;
408+
}
409+
410+
Some(GuardProcessGroup { pgid })
411+
}
412+
413+
#[cfg(not(unix))]
414+
fn guard_process_group(_child_id: u32) -> Option<GuardProcessGroup> {
415+
None
416+
}
417+
418+
fn terminate_guard_process(child: &mut Child, process_group: Option<GuardProcessGroup>) {
419+
terminate_guard_process_group(process_group);
391420
let _ = child.kill();
392421
}
393422

394423
#[cfg(unix)]
395-
fn terminate_guard_process_group(child_id: u32) {
396-
signal_process_group(child_id, "TERM");
424+
fn terminate_guard_process_group(process_group: Option<GuardProcessGroup>) {
425+
let Some(process_group) = process_group else {
426+
return;
427+
};
428+
429+
signal_process_group(process_group, SIGTERM);
397430
std::thread::sleep(Duration::from_millis(25));
398-
signal_process_group(child_id, "KILL");
431+
signal_process_group(process_group, SIGKILL);
399432
}
400433

401434
#[cfg(unix)]
402-
fn signal_process_group(child_id: u32, signal: &str) {
403-
let _ = Command::new("/bin/kill")
404-
.arg(format!("-{signal}"))
405-
.arg(format!("-{child_id}"))
406-
.stdout(Stdio::null())
407-
.stderr(Stdio::null())
408-
.status();
435+
fn signal_process_group(process_group: GuardProcessGroup, signal: i32) {
436+
// Negative PID targets the process group. The group is captured after spawn
437+
// and rejected if it is not the isolated child group.
438+
unsafe {
439+
let _ = kill(-process_group.pgid, signal);
440+
}
409441
}
410442

411443
#[cfg(not(unix))]
412-
fn terminate_guard_process_group(_child_id: u32) {}
444+
fn terminate_guard_process_group(_process_group: Option<GuardProcessGroup>) {}
445+
446+
#[cfg(unix)]
447+
const SIGTERM: i32 = 15;
448+
449+
#[cfg(unix)]
450+
const SIGKILL: i32 = 9;
451+
452+
#[cfg(unix)]
453+
fn unix_getpgid(pid: i32) -> Option<i32> {
454+
let pgid = unsafe { getpgid(pid) };
455+
(pgid > 0).then_some(pgid)
456+
}
457+
458+
#[cfg(unix)]
459+
fn unix_getpgrp() -> Option<i32> {
460+
let pgid = unsafe { getpgrp() };
461+
(pgid > 0).then_some(pgid)
462+
}
463+
464+
#[cfg(unix)]
465+
unsafe extern "C" {
466+
fn getpgid(pid: i32) -> i32;
467+
fn getpgrp() -> i32;
468+
fn kill(pid: i32, sig: i32) -> i32;
469+
}
413470

414471
pub fn unknown_guard_diagnostic(guard_id: &str, location: &str) -> Diagnostic {
415472
Diagnostic::new(
@@ -431,3 +488,29 @@ fn dedup_guard_ids(ids: impl IntoIterator<Item = String>) -> Vec<String> {
431488

432489
deduped
433490
}
491+
492+
#[cfg(all(test, unix))]
493+
mod unix_process_group_tests {
494+
use super::*;
495+
496+
#[test]
497+
fn current_process_group_is_not_treated_as_guard_group() {
498+
assert!(guard_process_group(std::process::id()).is_none());
499+
}
500+
501+
#[test]
502+
fn isolated_child_process_group_is_captured() -> Result<(), Box<dyn std::error::Error>> {
503+
let mut command = Command::new("/bin/sleep");
504+
command.arg("5");
505+
configure_guard_process_group(&mut command);
506+
507+
let mut child = command.spawn()?;
508+
let process_group = guard_process_group(child.id());
509+
510+
assert!(process_group.is_some());
511+
terminate_guard_process(&mut child, process_group);
512+
let _ = child.wait();
513+
514+
Ok(())
515+
}
516+
}

0 commit comments

Comments
 (0)