Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 123 additions & 2 deletions crates/eros-engine-llm/src/model_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,12 @@ pub struct ResolvedModel {
/// Resolved reasoning config (see `TaskConfig::reasoning`). `None` → omit
/// the wire param; `Some(cfg)` → forwarded as the `reasoning` object.
pub reasoning: Option<ReasoningConfig>,
/// Number of fallback models the chat burst may try after the primary.
/// `fallback_model` is already truncated to this length by `resolve()`.
/// Precedence: tier override, then task-level value, then the default of 2
/// (primary + 2 fallbacks = 3-entry chain, matching the prior
/// `MAX_STREAM_FALLBACK_DEPTH = 3` hard cap).
pub retry_depth: u32,
}

/// Resolved output-filter parameters for a chat request.
Expand All @@ -449,6 +455,10 @@ pub struct ResolvedOutputFilter {
pub trigger: OutputFilterTrigger,
pub timing: FilterTiming,
pub retry_depth: u32,
/// Reasoning config forwarded from `[tasks.chat_output_filter]`. Task-level
/// only (no per-tier override), consistent with `chat_companion`'s own
/// `reasoning` field shape.
pub reasoning: Option<ReasoningConfig>,
}

impl ModelConfig {
Expand Down Expand Up @@ -545,13 +555,22 @@ impl ModelConfig {
// Task-level only (tiers inherit), mirroring temperature/max_tokens.
let reasoning = task_cfg.and_then(|t| t.reasoning.clone());

// retry_depth: tier > task > default 2. Truncate fallback_model to
// retry_depth entries so the caller never needs to cap the chain.
let retry_depth = tier_cfg
.and_then(|t| t.retry_depth)
.or_else(|| task_cfg.and_then(|t| t.retry_depth))
.unwrap_or(2);
fallback_model.truncate(retry_depth as usize);

ResolvedModel {
model,
fallback_model,
temperature,
max_tokens,
allow_traits,
reasoning,
retry_depth,
}
}

Expand Down Expand Up @@ -614,11 +633,17 @@ impl ModelConfig {
.or(task_cfg.retry_depth)
.unwrap_or(1); // default 1: primary + first fallback only

// reasoning: task-level only (no per-tier override), consistent with
// chat_companion's own reasoning field.
let reasoning = task_cfg.reasoning.clone();

// model / fallback / temperature / max_tokens via the existing resolver
// (tier → default block → [defaults] → compiled-in).
// (tier → default block → [defaults] → compiled-in). Note: resolve()
// now truncates fallback_model to its own retry_depth; we re-truncate
// to chat_output_filter's retry_depth (which may differ).
let m = self.resolve(FILTER_TASK, tier);
let mut fallback_model = m.fallback_model;
fallback_model.truncate(retry_depth as usize); // cap to retry_depth entries
fallback_model.truncate(retry_depth as usize); // cap to filter's retry_depth entries
Some(ResolvedOutputFilter {
model: m.model,
fallback_model,
Expand All @@ -628,6 +653,7 @@ impl ModelConfig {
trigger,
timing,
retry_depth,
reasoning,
})
}
}
Expand Down Expand Up @@ -1701,6 +1727,101 @@ trigger = { traits = { any = ["a"] } }
assert_eq!(v, serde_json::json!({}));
}

// ─── Item 1: reasoning threaded through resolve_output_filter ─────────

#[test]
fn resolve_output_filter_threads_reasoning() {
    // Config with the companion filter switched on and a `reasoning` entry
    // declared on the filter task itself.
    let source = r#"
[tasks.chat_companion]
output_filter = true
model = "x/y"

[tasks.chat_output_filter]
model = "filter/m"
filter_prompt = "rewrite"
reasoning = { enabled = false }
"#;
    let parsed: ModelConfig = toml::from_str(source).unwrap();

    // The filter must resolve, and the task-level reasoning config must be
    // carried onto the resolved filter.
    let filter = parsed
        .resolve_output_filter(None)
        .expect("filter resolved");
    assert!(filter.reasoning.is_some());
}

#[test]
fn resolve_output_filter_reasoning_absent_is_none() {
    // Same setup as the "threads reasoning" case, but the filter task
    // declares no `reasoning` key at all.
    let source = r#"
[tasks.chat_companion]
output_filter = true
model = "x/y"

[tasks.chat_output_filter]
model = "filter/m"
filter_prompt = "rewrite"
"#;
    let parsed: ModelConfig = toml::from_str(source).unwrap();

    // With nothing configured, the resolved filter carries no reasoning.
    let filter = parsed
        .resolve_output_filter(None)
        .expect("filter resolved");
    assert!(filter.reasoning.is_none());
}

// ─── Item 2: chat_companion retry_depth ───────────────────────────────

#[test]
fn resolve_chat_companion_retry_depth_defaults_to_2() {
    // Four fallbacks configured, no explicit `retry_depth` anywhere.
    let source = r#"
[tasks.chat_companion]
model = "x/y"
fallback = ["a/b", "c/d", "e/f", "g/h"]
"#;
    let parsed: ModelConfig = toml::from_str(source).unwrap();
    let resolved = parsed.resolve("chat_companion", None);

    // Default depth applies...
    assert_eq!(resolved.retry_depth, 2);

    // ...and the fallback chain is cut down to that many entries.
    let expected_fallbacks = vec![String::from("a/b"), String::from("c/d")];
    assert_eq!(resolved.fallback_model, expected_fallbacks);
}

#[test]
fn resolve_chat_companion_retry_depth_overridable() {
    // Task-level `retry_depth = 3` with exactly three fallbacks configured.
    let source = r#"
[tasks.chat_companion]
model = "x/y"
fallback = ["a/b", "c/d", "e/f"]
retry_depth = 3
"#;
    let parsed: ModelConfig = toml::from_str(source).unwrap();
    let resolved = parsed.resolve("chat_companion", None);

    // The task-level value replaces the default.
    assert_eq!(resolved.retry_depth, 3);

    // All three fallbacks survive the truncation.
    let expected_fallbacks = vec![
        String::from("a/b"),
        String::from("c/d"),
        String::from("e/f"),
    ];
    assert_eq!(resolved.fallback_model, expected_fallbacks);
}

#[test]
fn resolve_chat_companion_retry_depth_tier_overrides_task() {
    // Task says depth 2; the `gold` tier narrows it to 1.
    let source = r#"
[tasks.chat_companion]
model = "x/y"
fallback = ["a/b", "c/d", "e/f"]
retry_depth = 2

[tasks.chat_companion.tiers.gold]
retry_depth = 1
"#;
    let parsed: ModelConfig = toml::from_str(source).unwrap();
    let resolved = parsed.resolve("chat_companion", Some("gold"));

    // The tier value takes precedence over the task-level value.
    assert_eq!(resolved.retry_depth, 1);

    // Only the first fallback remains after truncation.
    let expected_fallbacks = vec![String::from("a/b")];
    assert_eq!(resolved.fallback_model, expected_fallbacks);
}

#[test]
fn resolve_output_filter_gating() {
use super::*;
Expand Down
10 changes: 3 additions & 7 deletions crates/eros-engine-server/src/pipeline/stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,12 +86,6 @@ pub fn ulid_string(u: Ulid) -> String {
u.to_string()
}

/// Maximum number of model attempts per streaming burst (= 1 primary + up to
/// 2 fallbacks). Each attempt surfaces as a separate visible bubble; the
/// frontend masks attempts beyond the first behind a "thinking" affordance, so
/// a depth of 3 buys extra resilience without looking like a bug to users.
pub const MAX_STREAM_FALLBACK_DEPTH: usize = 3;

use std::sync::Arc;
use uuid::Uuid;

Expand Down Expand Up @@ -146,10 +140,11 @@ fn drive_chat_burst(
) -> impl futures_util::Stream<Item = ProtocolFrame> + Send + 'static {
async_stream::stream! {
let chat_repo = ChatRepo { pool: &state.pool };
// The fallback_model is already truncated to retry_depth entries by
// resolve() — no cap needed here; the chain is just [primary] + fallbacks.
let chain: Vec<String> = std::iter::once(req.model.clone())
.chain(req.fallback_model.iter().cloned())
.filter(|s| !s.is_empty())
.take(MAX_STREAM_FALLBACK_DEPTH)
.collect();
if chain.is_empty() {
yield ProtocolFrame::Error {
Expand Down Expand Up @@ -467,6 +462,7 @@ async fn run_output_filter(
],
temperature: f.temperature as f32,
max_tokens: f.max_tokens,
reasoning: f.reasoning.clone(),
..Default::default()
};
const FILTER_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(15);
Expand Down
12 changes: 10 additions & 2 deletions docs/model-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,8 @@ If any condition is unmet the filter is **inert** — the original reply is deli

```toml
[tasks.chat_output_filter]
model = "anthropic/claude-haiku-4.5" # fast model recommended
fallback = ["deepseek/deepseek-v4-flash", "x/y"]
model = "openai/gpt-5.4-nano"
fallback = ["google/gemini-3.1-flash", "zhipuai/zlm-4.7-flash"]
retry_depth = 1 # fallbacks to try on filter failure (default 1 = primary + first fallback)
temperature = 0.3
max_tokens = 400
Expand All @@ -126,6 +126,14 @@ timing = "after_extract" # or "before_extract"
filter_prompt = "..." # any field is optional; falls back to the default block
```

**Recommended models for `chat_output_filter`:**

- **Primary**: `openai/gpt-5.4-nano` — fast, stable filtered output.
- **DO NOT** use `openai/gpt-4.1-nano` as the filter model — empirically returns `"对不起,无法满足你的要求"`-style refusals with HTTP 200, which the engine cannot distinguish from a successful filtered rewrite, so the fail-open path never triggers and the user sees the refusal text.
- **Recommended fallback**: `google/gemini-3.1-flash` — high success rate; when it does fail it surfaces a proper error response (non-200), letting the engine's fail-open path kick in and emit the original reply.
- **Cost-saving fallback**: `zhipuai/zlm-4.7-flash` — cheaper, similar fail-mode profile to gemini-3.1-flash.
- **DO NOT** use `anthropic/claude-haiku-4.5` for the filter — its input tolerance for NSFW (great for extraction) does NOT extend to output; the safety alignment on the output side is strict enough that the filter LLM often refuses to produce rewritten text at all.

| Field | Type | Default | Notes |
|---|---|---|---|
| `model` | `String` \| `Array` \| `Table` | — | Primary filter model. Accepts the same three shapes as `chat_companion.model`. |
Expand Down
24 changes: 22 additions & 2 deletions examples/model_config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,29 @@ model_name_display_override = true
# The filter only runs if BOTH output_filter is true AND this table exists with
# a non-blank filter_prompt; otherwise it is inert. Only the FILTERED text is
# shown/stored; extract (memory/insight/affinity) reads the original by default.
#
# ── Recommended models for chat_output_filter ────────────────────────────
# Primary: openai/gpt-5.4-nano — fast, stable filtered output.
#
# DO NOT use openai/gpt-4.1-nano as the filter model — it empirically returns
# "对不起,无法满足你的要求"-style refusals with HTTP 200, which the engine
# cannot distinguish from a successful filtered rewrite, so the fail-open path
# never triggers and the user sees the refusal text.
#
# Recommended fallback: google/gemini-3.1-flash — high success rate; when it
# does fail it surfaces a proper error response (non-200), letting the engine's
# fail-open path kick in and emit the original reply.
#
# Cost-saving fallback: zhipuai/zlm-4.7-flash — cheaper, similar fail-mode
# profile to gemini-3.1-flash.
#
# DO NOT use anthropic/claude-haiku-4.5 for the filter — its input tolerance
# for NSFW (great for extraction) does NOT extend to output; the safety
# alignment on the output side is strict enough that the filter LLM often
# refuses to produce rewritten text at all.
#[tasks.chat_output_filter]
#model = "anthropic/claude-haiku-4.5" # fast model recommended
#fallback = ["deepseek/deepseek-v4-flash", "x/y"]
#model = "openai/gpt-5.4-nano"
#fallback = ["google/gemini-3.1-flash", "zhipuai/zlm-4.7-flash"]
#retry_depth = 1 # fallbacks to try on filter failure; default 1 (primary + first fallback)
#temperature = 0.3
#max_tokens = 400
Expand Down
Loading