Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions crates/ironclaw_host_runtime/CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,18 @@

- `turn_scheduler.rs` owns scheduler-backed run concurrency. It does not own the
canonical loop tick or product inbound serialization.
- Concurrency invariant: the scheduler runs up to `max_concurrent_runs` runs in
parallel, one `tokio::spawn` + `OwnedSemaphorePermit` per run. The single
`TurnRunnerId` is the scheduler *instance*, not a per-run worker — seeing one
`TurnRunnerId` is expected and does NOT mean serial execution.
- `TurnRunSchedulerConfig::default().max_concurrent_runs` must equal
`DEFAULT_MAX_CONCURRENT_RUNS` (do not re-introduce a divergent literal —
`Default` previously hard-coded `4` while production used `16`). The
production worker-count default `ironclaw_reborn::DEFAULT_TURN_RUNNER_WORKER_COUNT`
is *derived from* `DEFAULT_MAX_CONCURRENT_RUNS`, so the scheduler cap and the
worker-count default are the same value by construction; keep the constant
above `1`. A configured `worker_count = 1` is legal but serializes all runs
through one slot — the CLI resolver warns on it.
- `surface.rs` owns host-runtime capability-surface shaping and versions.
- `production.rs` and `services.rs` compose runtime services and readiness
evidence used by Reborn loop wiring.
Expand Down
5 changes: 3 additions & 2 deletions crates/ironclaw_host_runtime/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,9 @@ pub use services::{
};
pub use surface::{CapabilitySurfacePolicy, VisibleCapability, VisibleCapabilityAccess};
pub use turn_scheduler::{
SchedulerTurnRunWakeNotifier, TurnRunExecutor, TurnRunExecutorError, TurnRunScheduler,
TurnRunSchedulerConfig, TurnRunSchedulerHandle, TurnRunWakeChannel,
DEFAULT_MAX_CONCURRENT_RUNS, SchedulerTurnRunWakeNotifier, TurnRunExecutor,
TurnRunExecutorError, TurnRunScheduler, TurnRunSchedulerConfig, TurnRunSchedulerHandle,
TurnRunWakeChannel,
};

/// Stable, validated idempotency key supplied by upper turn/loop services.
Expand Down
17 changes: 16 additions & 1 deletion crates/ironclaw_host_runtime/src/turn_scheduler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,21 @@ use tokio_util::sync::CancellationToken;
use tracing::Instrument;
use tracing::debug;

/// Canonical default for the scheduler's concurrent-run cap (the number of turn
/// runs that may be `Running` at once on a single scheduler instance).
///
/// This is the single source of truth for the scheduler default. The production
/// composition (`ironclaw_reborn`) always overrides the running config via
/// [`TurnRunSchedulerConfig::with_max_concurrent_runs`] using its
/// `DEFAULT_TURN_RUNNER_WORKER_COUNT` knob, which is itself *derived from* this
/// constant — so the worker-count default and the scheduler cap are the same
/// value by construction. (The derivation only works one way: `ironclaw_reborn`
/// depends on `ironclaw_host_runtime`, not the reverse.)
///
/// `1` is a legal value but degenerate: it serializes every run through one
/// permit. Keep this comfortably above 1.
pub const DEFAULT_MAX_CONCURRENT_RUNS: usize = 16;

#[derive(Debug, Clone)]
pub struct TurnRunSchedulerConfig {
max_concurrent_runs: usize,
Expand All @@ -35,7 +50,7 @@ pub struct TurnRunSchedulerConfig {
impl Default for TurnRunSchedulerConfig {
fn default() -> Self {
Self {
max_concurrent_runs: 4,
max_concurrent_runs: DEFAULT_MAX_CONCURRENT_RUNS,
poll_interval: Duration::from_secs(5),
lease_recovery_interval: Duration::from_secs(10),
runner_heartbeat_interval: Duration::from_secs(30),
Expand Down
41 changes: 40 additions & 1 deletion crates/ironclaw_host_runtime/src/turn_scheduler/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,46 @@ use ironclaw_turns::{
},
};

use super::{TurnRunExecutor, TurnRunExecutorError, TurnRunScheduler, TurnRunSchedulerConfig};
use super::{
DEFAULT_MAX_CONCURRENT_RUNS, TurnRunExecutor, TurnRunExecutorError, TurnRunScheduler,
TurnRunSchedulerConfig,
};

// ── Config defaults ───────────────────────────────────────────────────────

/// The scheduler's `Default` concurrent-run cap must equal the canonical
/// constant. This pins the historical regression where `Default` hard-coded `4`
/// while the production composition used `16`, so a `Default`-only caller would
/// silently under-provision the pool. Before the fix this asserted against `4`
/// and failed.
#[test]
fn default_config_uses_canonical_max_concurrent_runs() {
// The default pool must allow concurrency (>1); 1 serializes all runs.
// Checked at compile time so the invariant holds even if the constant moves.
const _: () = assert!(DEFAULT_MAX_CONCURRENT_RUNS > 1);

assert_eq!(
TurnRunSchedulerConfig::default().max_concurrent_runs(),
DEFAULT_MAX_CONCURRENT_RUNS,
"Default scheduler config must use DEFAULT_MAX_CONCURRENT_RUNS, not a divergent literal"
);
}

/// `with_max_concurrent_runs` floors to a non-zero value: 0 must not produce a
/// scheduler that can never claim a run.
#[test]
fn with_max_concurrent_runs_floors_zero_to_one() {
let config = TurnRunSchedulerConfig::default().with_max_concurrent_runs(0);
assert_eq!(config.max_concurrent_runs(), 1);
}

/// The exact floor boundary: an explicit `1` must be preserved as `1` (not
/// floored up to 2, not dropped to 0). Guards a refactor that mis-edits `.max`.
#[test]
fn with_max_concurrent_runs_keeps_one() {
let config = TurnRunSchedulerConfig::default().with_max_concurrent_runs(1);
assert_eq!(config.max_concurrent_runs(), 1);
}

// ── Minimal fakes ────────────────────────────────────────────────────────

Expand Down
34 changes: 28 additions & 6 deletions crates/ironclaw_reborn/src/runtime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,17 @@ use crate::{

/// Default number of turn-runner worker tasks spawned per runtime instance.
///
/// Used by [`DefaultPlannedRuntimeConfig`] and [`TurnRunnerSettings`] so the
/// value is defined exactly once and shared across all crates in the stack.
/// Derived from the scheduler's own
/// [`ironclaw_host_runtime::DEFAULT_MAX_CONCURRENT_RUNS`] so the worker-count
/// default and the scheduler's concurrent-run cap are the same value by
/// construction (no literal duplication, no drift). Used by
/// [`DefaultPlannedRuntimeConfig`] and [`TurnRunnerSettings`].
pub const DEFAULT_TURN_RUNNER_WORKER_COUNT: std::num::NonZeroUsize =
match std::num::NonZeroUsize::new(16) {
match std::num::NonZeroUsize::new(ironclaw_host_runtime::DEFAULT_MAX_CONCURRENT_RUNS) {
Some(v) => v,
// 16 is a non-zero compile-time constant so this arm is never reached.
// `NonZeroUsize::MIN` (= 1) is used as a non-panicking fallback so the
// CI "no panics in production code" check stays green.
// DEFAULT_MAX_CONCURRENT_RUNS is a non-zero compile-time constant so this
// arm is never reached. `NonZeroUsize::MIN` (= 1) is a non-panicking
// fallback so the CI "no panics in production code" check stays green.
None => std::num::NonZeroUsize::MIN,
};

Expand Down Expand Up @@ -782,6 +785,25 @@ mod tests {
DecoratingLoopCapabilityPortFactory, LoopCapabilityPortDecorator, LoopCapabilityPortFactory,
};

/// The worker-count default is derived from the scheduler's canonical
/// concurrency default, so the two are equal by construction. This test
/// documents and locks that contract — and asserts the shared value allows
/// real concurrency (> 1), which is the property a misconfiguration or a
/// future literal-override would silently break.
#[test]
fn worker_count_default_matches_scheduler_default() {
assert_eq!(
super::DEFAULT_TURN_RUNNER_WORKER_COUNT.get(),
ironclaw_host_runtime::DEFAULT_MAX_CONCURRENT_RUNS,
"DEFAULT_TURN_RUNNER_WORKER_COUNT must stay equal to \
ironclaw_host_runtime::DEFAULT_MAX_CONCURRENT_RUNS"
);
assert!(
super::DEFAULT_TURN_RUNNER_WORKER_COUNT.get() > 1,
"the default worker pool must allow concurrency (> 1)"
);
}

async fn test_run_context() -> LoopRunContext {
let tenant_id = TenantId::new("tenant-runtime-test").unwrap();
let agent_id = AgentId::new("agent-runtime-test").unwrap();
Expand Down
88 changes: 88 additions & 0 deletions crates/ironclaw_reborn_cli/src/runtime/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -991,6 +991,19 @@ fn runner_settings(
})?
};

// A worker_count of 1 is legal but degenerate: it serializes every turn
// run through a single scheduler permit. Operators almost never want
// this for a multi-trigger / multi-user deployment. We honour the
// explicit value (silently overriding it would be surprising) but make
// the loss of concurrency loud at boot.
if settings.worker_count.get() == 1 {
tracing::warn!(
worker_count = 1,
"config file [runner].worker_count = 1 serializes all turn runs through one \
scheduler slot; set it to >= 2 for concurrent runs"
);
}

// Each cap: absent in the file → keep the struct default already in
// `settings`; explicit `0` → "unlimited" sentinel (None); positive → cap.
settings.max_concurrent_runs_per_user = resolve_concurrency_cap(
Expand Down Expand Up @@ -1103,6 +1116,81 @@ mod tests {
assert_eq!(settings.worker_count.get(), MAX_WORKER_COUNT);
}

#[test]
fn runner_settings_worker_count_one_is_honoured() {
// `1` is a legal-but-degenerate value: the resolver must keep it (it
// does NOT silently override the operator), so the scheduler will be
// single-slot. The accompanying warn is asserted separately below.
let cfg = parse_runner_section("[runner]\nworker_count = 1\n");
let settings = runner_settings(Some(&cfg)).expect("should succeed");
assert_eq!(settings.worker_count.get(), 1);
}

/// Run `runner_settings` for the given `[runner]` TOML under a capturing
/// tracing subscriber and return whether a WARN mentioning `worker_count`
/// was emitted. Driving the REAL resolver is the point — a unit test on the
/// predicate alone would not catch the warn being dropped from the path.
fn resolver_emits_worker_count_warn(runner_toml: &str) -> bool {
use std::sync::{Arc, Mutex};
use tracing_subscriber::Layer;
use tracing_subscriber::layer::Context;
use tracing_subscriber::prelude::*;

#[derive(Clone, Default)]
struct CaptureLayer {
events: Arc<Mutex<Vec<(tracing::Level, String)>>>,
}
struct MessageVisitor(String);
impl tracing::field::Visit for MessageVisitor {
fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
if field.name() == "message" {
self.0 = format!("{value:?}");
}
}
}
impl<S: tracing::Subscriber> Layer<S> for CaptureLayer {
fn on_event(&self, event: &tracing::Event<'_>, _ctx: Context<'_, S>) {
let mut visitor = MessageVisitor(String::new());
event.record(&mut visitor);
self.events
.lock()
.expect("lock")
.push((*event.metadata().level(), visitor.0));
}
}

let layer = CaptureLayer::default();
let events = Arc::clone(&layer.events);
let subscriber = tracing_subscriber::registry().with(layer);

let cfg = parse_runner_section(runner_toml);
tracing::subscriber::with_default(subscriber, || {
runner_settings(Some(&cfg)).expect("should succeed");
});

events
.lock()
.expect("lock")
.iter()
.any(|(level, msg)| *level == tracing::Level::WARN && msg.contains("worker_count"))
}

#[test]
fn runner_settings_warns_on_degenerate_worker_count_one() {
assert!(
resolver_emits_worker_count_warn("[runner]\nworker_count = 1\n"),
"worker_count = 1 must emit a WARN about serialized runs"
);
}

#[test]
fn runner_settings_no_warn_on_normal_worker_count() {
assert!(
!resolver_emits_worker_count_warn("[runner]\nworker_count = 7\n"),
"a normal worker_count must NOT emit the serialization WARN"
);
}

#[test]
fn runner_settings_zero_caps_become_none_unlimited() {
let cfg = parse_runner_section(
Expand Down
4 changes: 3 additions & 1 deletion crates/ironclaw_reborn_config/src/config_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,9 @@ pub struct HarnessSection {
pub struct RunnerSection {
pub heartbeat_interval_secs: Option<u64>,
pub poll_interval_ms: Option<u64>,
/// Number of concurrent turn-runner worker tasks. `None` or `0` defaults to 4. Clamped to 32.
/// Number of concurrent turn-runner worker tasks. `None` or `0` defaults to
/// 16 (`DEFAULT_TURN_RUNNER_WORKER_COUNT`); clamped to 32. `1` is accepted
/// but serializes all runs through one slot, so prefer `>= 2`.
pub worker_count: Option<usize>,
/// Max concurrent runs in `TurnStatus::Running` per (tenant_id, owner user_id). `None` or `0` = unlimited.
pub max_concurrent_runs_per_user: Option<u32>,
Expand Down
Loading
Loading