etherfunlab · enriquephl · May 26, 2026 · May 26, 2026 · May 26, 2026
diff --git a/crates/eros-engine-llm/src/model_config.rs b/crates/eros-engine-llm/src/model_config.rs
@@ -432,6 +432,12 @@ pub struct ResolvedModel {
     /// Resolved reasoning config (see `TaskConfig::reasoning`). `None` → omit
     /// the wire param; `Some(cfg)` → forwarded as the `reasoning` object.
     pub reasoning: Option<ReasoningConfig>,
+    /// Maximum number of fallback models the chat burst may try after the
+    /// primary. `resolve()` truncates `fallback_model` to at most this many
+    /// entries. Precedence: tier override > task-level value > default 2
+    /// (primary + 2 fallbacks = 3-entry chain, matching the prior
+    /// `MAX_STREAM_FALLBACK_DEPTH = 3` hard-cap).
+    pub retry_depth: u32,
 }
 
 /// Resolved output-filter parameters for a chat request.
@@ -449,6 +455,10 @@ pub struct ResolvedOutputFilter {
     pub trigger: OutputFilterTrigger,
     pub timing: FilterTiming,
     pub retry_depth: u32,
+    /// Reasoning config forwarded from `[tasks.chat_output_filter]`. Task-level
+    /// only (no per-tier override), consistent with `chat_companion`'s own
+    /// `reasoning` field shape.
+    pub reasoning: Option<ReasoningConfig>,
 }
 
 impl ModelConfig {
@@ -545,13 +555,22 @@ impl ModelConfig {
         // Task-level only (tiers inherit), mirroring temperature/max_tokens.
         let reasoning = task_cfg.and_then(|t| t.reasoning.clone());
 
+        // retry_depth: tier > task > default 2. Truncate fallback_model to
+        // retry_depth entries so the caller never needs to cap the chain.
+        let retry_depth = tier_cfg
+            .and_then(|t| t.retry_depth)
+            .or_else(|| task_cfg.and_then(|t| t.retry_depth))
+            .unwrap_or(2);
+        fallback_model.truncate(retry_depth as usize);
+
         ResolvedModel {
             model,
             fallback_model,
             temperature,
             max_tokens,
             allow_traits,
             reasoning,
+            retry_depth,
         }
     }
 
@@ -614,11 +633,17 @@ impl ModelConfig {
             .or(task_cfg.retry_depth)
             .unwrap_or(1); // default 1: primary + first fallback only
 
+        // reasoning: task-level only (no per-tier override), consistent with
+        // chat_companion's own reasoning field.
+        let reasoning = task_cfg.reasoning.clone();
+
         // model / fallback / temperature / max_tokens via the existing resolver
-        // (tier → default block → [defaults] → compiled-in).
+        // (tier → default block → [defaults] → compiled-in). resolve() has
+        // already truncated fallback_model to its own retry_depth (default 2);
+        // the truncate below applies the filter's retry_depth (default 1).
         let m = self.resolve(FILTER_TASK, tier);
         let mut fallback_model = m.fallback_model;
-        fallback_model.truncate(retry_depth as usize); // cap to retry_depth entries
+        fallback_model.truncate(retry_depth as usize); // cap to filter's retry_depth entries
         Some(ResolvedOutputFilter {
             model: m.model,
             fallback_model,
@@ -628,6 +653,7 @@ impl ModelConfig {
             trigger,
             timing,
             retry_depth,
+            reasoning,
         })
     }
 }
@@ -1701,6 +1727,101 @@ trigger = { traits = { any = ["a"] } }
         assert_eq!(v, serde_json::json!({}));
     }
 
+    // ─── Item 1: reasoning threaded through resolve_output_filter ─────────
+
+    #[test]
+    fn resolve_output_filter_threads_reasoning() {
+        let cfg: ModelConfig = toml::from_str(
+            r#"
+[tasks.chat_companion]
+output_filter = true
+model = "x/y"
+
+[tasks.chat_output_filter]
+model = "filter/m"
+filter_prompt = "rewrite"
+reasoning = { enabled = false }
+"#,
+        )
+        .unwrap();
+        let resolved = cfg.resolve_output_filter(None).expect("filter resolved");
+        assert!(resolved.reasoning.is_some());
+    }
+
+    #[test]
+    fn resolve_output_filter_reasoning_absent_is_none() {
+        let cfg: ModelConfig = toml::from_str(
+            r#"
+[tasks.chat_companion]
+output_filter = true
+model = "x/y"
+
+[tasks.chat_output_filter]
+model = "filter/m"
+filter_prompt = "rewrite"
+"#,
+        )
+        .unwrap();
+        let resolved = cfg.resolve_output_filter(None).expect("filter resolved");
+        assert!(resolved.reasoning.is_none());
+    }
+
+    // ─── Item 2: chat_companion retry_depth ───────────────────────────────
+
+    #[test]
+    fn resolve_chat_companion_retry_depth_defaults_to_2() {
+        let cfg: ModelConfig = toml::from_str(
+            r#"
+[tasks.chat_companion]
+model = "x/y"
+fallback = ["a/b", "c/d", "e/f", "g/h"]
+"#,
+        )
+        .unwrap();
+        let r = cfg.resolve("chat_companion", None);
+        assert_eq!(r.retry_depth, 2);
+        // fallback truncated to retry_depth entries
+        assert_eq!(r.fallback_model, vec!["a/b".to_string(), "c/d".to_string()]);
+    }
+
+    #[test]
+    fn resolve_chat_companion_retry_depth_overridable() {
+        let cfg: ModelConfig = toml::from_str(
+            r#"
+[tasks.chat_companion]
+model = "x/y"
+fallback = ["a/b", "c/d", "e/f"]
+retry_depth = 3
+"#,
+        )
+        .unwrap();
+        let r = cfg.resolve("chat_companion", None);
+        assert_eq!(r.retry_depth, 3);
+        assert_eq!(
+            r.fallback_model,
+            vec!["a/b".to_string(), "c/d".to_string(), "e/f".to_string()]
+        );
+    }
+
+    #[test]
+    fn resolve_chat_companion_retry_depth_tier_overrides_task() {
+        let cfg: ModelConfig = toml::from_str(
+            r#"
+[tasks.chat_companion]
+model = "x/y"
+fallback = ["a/b", "c/d", "e/f"]
+retry_depth = 2
+
+[tasks.chat_companion.tiers.gold]
+retry_depth = 1
+"#,
+        )
+        .unwrap();
+        let r = cfg.resolve("chat_companion", Some("gold"));
+        assert_eq!(r.retry_depth, 1);
+        assert_eq!(r.fallback_model, vec!["a/b".to_string()]);
+    }
+
     #[test]
     fn resolve_output_filter_gating() {
         use super::*;

diff --git a/crates/eros-engine-server/src/pipeline/stream.rs b/crates/eros-engine-server/src/pipeline/stream.rs
@@ -86,12 +86,6 @@ pub fn ulid_string(u: Ulid) -> String {
     u.to_string()
 }
 
-/// Maximum number of model attempts per streaming burst (= 1 primary + up to
-/// 2 fallbacks). Each attempt surfaces as a separate visible bubble; the
-/// frontend masks attempts beyond the first behind a "thinking" affordance, so
-/// a depth of 3 buys extra resilience without looking like a bug to users.
-pub const MAX_STREAM_FALLBACK_DEPTH: usize = 3;
-
 use std::sync::Arc;
 use uuid::Uuid;
 
@@ -146,10 +140,11 @@ fn drive_chat_burst(
 ) -> impl futures_util::Stream<Item = ProtocolFrame> + Send + 'static {
     async_stream::stream! {
         let chat_repo = ChatRepo { pool: &state.pool };
+        // resolve() has already truncated fallback_model to retry_depth entries,
+        // so no cap is applied here; the chain is [primary] + fallbacks, minus blanks.
         let chain: Vec<String> = std::iter::once(req.model.clone())
             .chain(req.fallback_model.iter().cloned())
             .filter(|s| !s.is_empty())
-            .take(MAX_STREAM_FALLBACK_DEPTH)
             .collect();
         if chain.is_empty() {
             yield ProtocolFrame::Error {
@@ -467,6 +462,7 @@ async fn run_output_filter(
         ],
         temperature: f.temperature as f32,
         max_tokens: f.max_tokens,
+        reasoning: f.reasoning.clone(),
         ..Default::default()
     };
     const FILTER_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(15);

diff --git a/docs/model-config.md b/docs/model-config.md
@@ -110,8 +110,8 @@ If any condition is unmet the filter is **inert** — the original reply is deli
 
 ```toml
 [tasks.chat_output_filter]
-model        = "anthropic/claude-haiku-4.5"   # fast model recommended
-fallback     = ["deepseek/deepseek-v4-flash", "x/y"]
+model        = "openai/gpt-5.4-nano"
+fallback     = ["google/gemini-3.1-flash", "zhipuai/zlm-4.7-flash"]
 retry_depth  = 1     # fallbacks to try on filter failure (default 1 = primary + first fallback)
 temperature  = 0.3
 max_tokens   = 400
@@ -126,6 +126,14 @@ timing       = "after_extract"   # or "before_extract"
 filter_prompt = "..."            # any field is optional; falls back to the default block
 ```
 
+**Recommended models for `chat_output_filter`:**
+
+- **Primary**: `openai/gpt-5.4-nano` — fast, stable filtered output.
+- **DO NOT** use `openai/gpt-4.1-nano` as the filter model — it empirically returns `"对不起，无法满足你的要求"`-style refusals with HTTP 200, which the engine cannot distinguish from a successful filtered rewrite, so the fail-open path never triggers and the user sees the refusal text.
+- **Recommended fallback**: `google/gemini-3.1-flash` — high success rate; when it does fail it surfaces a proper error response (non-200), letting the engine's fail-open path kick in and emit the original reply.
+- **Cost-saving fallback**: `zhipuai/zlm-4.7-flash` — cheaper, similar fail-mode profile to gemini-3.1-flash.
+- **DO NOT** use `anthropic/claude-haiku-4.5` for the filter — its input tolerance for NSFW (great for extraction) does NOT extend to output; the safety alignment on the output side is strict enough that the filter LLM often refuses to produce rewritten text at all.
+
 | Field | Type | Default | Notes |
 |---|---|---|---|
 | `model` | `String` \| `Array` \| `Table` | — | Primary filter model. Accepts the same three shapes as `chat_companion.model`. |

diff --git a/examples/model_config.toml b/examples/model_config.toml
@@ -69,9 +69,29 @@ model_name_display_override = true
 # The filter only runs if BOTH output_filter is true AND this table exists with
 # a non-blank filter_prompt; otherwise it is inert. Only the FILTERED text is
 # shown/stored; extract (memory/insight/affinity) reads the original by default.
+#
+# ── Recommended models for chat_output_filter ────────────────────────────
+# Primary: openai/gpt-5.4-nano — fast, stable filtered output.
+#
+# DO NOT use openai/gpt-4.1-nano as the filter model — it empirically returns
+# "对不起，无法满足你的要求"-style refusals with HTTP 200, which the engine
+# cannot distinguish from a successful filtered rewrite, so the fail-open path
+# never triggers and the user sees the refusal text.
+#
+# Recommended fallback: google/gemini-3.1-flash — high success rate; when it
+# does fail it surfaces a proper error response (non-200), letting the engine's
+# fail-open path kick in and emit the original reply.
+#
+# Cost-saving fallback: zhipuai/zlm-4.7-flash — cheaper, similar fail-mode
+# profile to gemini-3.1-flash.
+#
+# DO NOT use anthropic/claude-haiku-4.5 for the filter — its input tolerance
+# for NSFW (great for extraction) does NOT extend to output; the safety
+# alignment on the output side is strict enough that the filter LLM often
+# refuses to produce rewritten text at all.
 #[tasks.chat_output_filter]
-#model        = "anthropic/claude-haiku-4.5"   # fast model recommended
-#fallback     = ["deepseek/deepseek-v4-flash", "x/y"]
+#model        = "openai/gpt-5.4-nano"
+#fallback     = ["google/gemini-3.1-flash", "zhipuai/zlm-4.7-flash"]
 #retry_depth  = 1     # fallbacks to try on filter failure; default 1 (primary + first fallback)
 #temperature  = 0.3
 #max_tokens   = 400