Skip to content

Commit e0ce4f4

Browse files
authored
fix(conversation): recover dead ACP turns after agent process loss (#514)
## Summary

- Add ACP turn recovery policy support to automatically replay safe retryable turns once after upstream agent/process failures.
- Evict unhealthy ACP tasks after recoverable terminal errors so later stop/cancel/send flows bind to a fresh runtime instead of a dead task.
- Keep team child-turn status handling transparent across recovered replays and cover recovery safety boundaries with focused tests.

## Test Plan

- [x] `just push -u origin feat/agent-turn-recoverable-error` (migration immutability, cargo fix/fmt, clippy, and `cargo nextest run --workspace`: 6379 passed, 18 skipped)
- [x] Local AionUi Dev solo smoke for Claude, Codex, and OpenCode: kill ACP child process immediately after sending; subsequent stop/cancel/send remain usable
- [x] Local AionUi Dev team smoke: kill ACP child process during a team turn; child turn auto-recovers and the team run completes

---------

Co-authored-by: zynx <>
1 parent 6b46055 commit e0ce4f4

9 files changed

Lines changed: 1343 additions & 112 deletions

File tree

crates/aionui-ai-agent/src/protocol/send_error.rs

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -217,6 +217,10 @@ impl AgentSendError {
217217
self.stream_error
218218
}
219219

220+
pub fn from_stream_error_data(stream_error: AgentStreamErrorData) -> Self {
221+
Self { stream_error }
222+
}
223+
220224
pub fn code(&self) -> Option<AgentErrorCode> {
221225
self.stream_error.code
222226
}

crates/aionui-conversation/src/agent_health_policy.rs

Lines changed: 18 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,10 @@ impl AgentHealthPolicy {
2222
outcome: &RelayOutcome,
2323
lifecycle: RuntimeLifecycleState,
2424
) -> AgentHealthAction {
25-
if lifecycle != RuntimeLifecycleState::Active {
25+
if matches!(
26+
lifecycle,
27+
RuntimeLifecycleState::Deleting | RuntimeLifecycleState::ShuttingDown
28+
) {
2629
return AgentHealthAction::Keep;
2730
}
2831
if agent_type != AgentType::Acp || !outcome.terminal.is_error() {
@@ -45,11 +48,11 @@ mod tests {
4548

4649
fn error_outcome(code: Option<AgentErrorCode>) -> RelayOutcome {
4750
RelayOutcome {
48-
system_responses: Vec::new(),
4951
terminal: RelayTerminal::Error {
5052
code,
5153
retryable: Some(true),
5254
},
55+
..RelayOutcome::default()
5356
}
5457
}
5558

@@ -98,15 +101,27 @@ mod tests {
98101
#[test]
99102
fn channel_closed_does_not_evict_by_default() {
100103
let outcome = RelayOutcome {
101-
system_responses: Vec::new(),
102104
terminal: RelayTerminal::ChannelClosed,
105+
..RelayOutcome::default()
103106
};
104107
assert_eq!(
105108
AgentHealthPolicy::decide(AgentType::Acp, &outcome, RuntimeLifecycleState::Active),
106109
AgentHealthAction::Keep
107110
);
108111
}
109112

113+
#[test]
114+
fn cancelling_acp_terminal_error_evicts_task() {
115+
let outcome = error_outcome(Some(AgentErrorCode::UnknownUpstreamError));
116+
assert!(matches!(
117+
AgentHealthPolicy::decide(AgentType::Acp, &outcome, RuntimeLifecycleState::Cancelling),
118+
AgentHealthAction::EvictAcpTask {
119+
clear_model_seed: false,
120+
..
121+
}
122+
));
123+
}
124+
110125
#[test]
111126
fn non_active_lifecycle_keeps_task() {
112127
let outcome = error_outcome(Some(AgentErrorCode::UserLlmProviderModelNotFound));

crates/aionui-conversation/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ pub mod stream_relay;
2424
pub mod task_options;
2525
mod turn_continuation_policy;
2626
mod turn_orchestrator;
27+
mod turn_recovery_policy;
2728

2829
pub use error::ConversationError;
2930
pub use response_middleware::{

0 commit comments

Comments
 (0)