[UPDATE] Update

mergennachin · mergennachin · commit 5f302ddef058 · 2026-06-09T13:50:59.000-07:00
[ghstack-poisoned]
diff --git a/extension/llm/runner/llm_session.h b/extension/llm/runner/llm_session.h
@@ -61,8 +61,11 @@ struct LLMServingCapacity {
   // sessions would copy the whole model); raise only on a backend proven to
   // share packed weights.
   int32_t max_physical_sessions_without_weight_duplication = 1;
-  // Planned bytes one session adds (KV + activations), for memory-budget
-  // admission. 0 = unknown; the server skips the memory clamp.
+  // Planned bytes one session adds (KV + activations). Reported for a FUTURE
+  // memory-budget admission policy; NOT yet enforced -- admission is currently
+  // by session COUNT only (--max-sessions). Over-provisioning therefore fails
+  // when the over-committed session first executes (cudaMalloc), not at admit
+  // time. 0 = unknown.
   int64_t estimated_bytes_per_session = 0;
 };
 
@@ -79,14 +82,28 @@ class ET_EXPERIMENTAL LLMSession {
   /// `initial_sampling` (optional): the sampling config for the FIRST generated
   /// token, for backends that sample during prefill (e.g. in-graph sampling).
   /// Pass it so the first token uses the request's sampling instead of a stale
-  /// default. Backends that only sample in decode_one() ignore it.
+  /// default. Backends that only sample in decode_one() ignore it. NOTE: on
+  /// prefill-sampling backends the first token is sampled here, so it does NOT
+  /// pass through decode_one()'s logit processors -- a grammar/tool mask that
+  /// must constrain the opening token is not applied to it (a known limitation
+  /// for grammar-constrained serving).
+  ///
+  /// ERROR CONTRACT: an error may be returned AFTER backend state has already
+  /// mutated. On any error from prefill_tokens()/decode_one(), the session is
+  /// POISONED -- position() may no longer agree with the resident KV. The
+  /// caller must call reset() (and only proceed once it returns Ok) before any
+  /// further prefill/decode; it must NOT retry the failed call. The serving
+  /// worker enforces this: it marks the session dirty and forces a reset on
+  /// the next request.
   virtual ::executorch::runtime::Error prefill_tokens(
       std::vector<uint64_t> tokens,
       const SamplingConfig* initial_sampling = nullptr) = 0;
 
   /// Decode one token from the pending state; looping reproduces a full
   /// generation while returning exact sampled token ids. A single decode_one()
   /// runs one forward pass and is not interruptible mid-call (see stop()).
+  /// On error the session is poisoned -- see the error contract on
+  /// prefill_tokens() (reset() before any further use; never retry).
   virtual ::executorch::runtime::Result<DecodeResult> decode_one(
       const SamplingConfig& sampling) = 0;
 
diff --git a/extension/llm/server/python/README.md b/extension/llm/server/python/README.md
@@ -160,6 +160,14 @@ Session capacity is determined by the worker/engine — a single worker hosts ma
 isolated sessions on one weight load — so `--num-runners` accepts 1; extra worker
 processes would each carry their own copy of the weights.
 
+The **generic `text_llm_worker` is scratch-only (V1)**: `TextLLMEngine::serving_capacity()`
+reports a conservative capacity of 1, so `max_named = max(0, capacity-1) = 0` — the default
+`server.py` serves only the anonymous scratch session (no named `session_id`s, no
+warm resume). The named-session / warm-resume / token-ID machinery is exercised
+by a model-specific worker whose engine reports capacity > 1 (the Qwen3.5-MoE CUDA
+worker). This is intentional; the generic worker stays minimal until a backend is
+proven to host multiple physical sessions without duplicating weights.
+
 Cancellation is best-effort: a worker request runs to completion and is not
 interruptible mid-generation in V1, so `runner.stop()` means "the control plane
 stops consuming and the worker finishes the current request" rather than a hard
diff --git a/extension/llm/server/python/chat_template.py b/extension/llm/server/python/chat_template.py
@@ -25,6 +25,38 @@
 
 _DEFAULT_SPECIAL_TOKENS = ["<|im_end|>", "<|endoftext|>", "<|eot_id|>", "<|end|>"]
 
+# Chat turn terminators eligible to be used as generation stop strings. This is a
+# deliberate allowlist of end-of-turn / end-of-text tokens -- NOT the tokenizer's
+# full special-token set. Structural/tool delimiters (e.g. <tool_call>) must reach
+# the tool parser, so they are intentionally excluded: using them as hard stops
+# would truncate a tool call before it is ever parsed.
+_TURN_TERMINATORS = (
+    "<|im_end|>",
+    "<|endoftext|>",
+    "<|eot_id|>",
+    "<|end|>",
+    "<|end_of_text|>",
+    "<end_of_turn>",
+    "</s>",
+)
+
+
+def _content_text(content) -> str:
+    """Best-effort text for the ChatML fallback: a str as-is, or the concatenated
+    text parts of an OpenAI list-content message (non-text parts dropped). Avoids
+    rendering a Python repr of structured content. None -> empty string."""
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        out = []
+        for part in content:
+            if isinstance(part, dict) and part.get("type") == "text":
+                out.append(str(part.get("text", "")))
+            elif isinstance(part, str):
+                out.append(part)
+        return "".join(out)
+    return str(content or "")
+
 
 def _decode_tool_call_arguments(messages: list[dict[str, Any]]) -> None:
     """In-place: parse each tool call's ``function.arguments`` from a JSON string
@@ -120,25 +152,64 @@ def count_tokens(self, prompt: str) -> Optional[int]:
             return len(self._hf.encode(prompt, add_special_tokens=False))
         return None
 
-    def special_tokens(self) -> list[str]:
-        """Special-token strings whose appearance ends the visible content.
+    def turn_stop_sequences(self) -> list[str]:
+        """Generation stop strings: model/template-specific *turn terminators*
+        only -- the tokenizer's EOS plus known chat turn-end tokens -- NOT the
+        full special-token set.
+
+        Structural/tool delimiters (e.g. <tool_call>) are deliberately excluded:
+        if a tokenizer registers them as special, using the whole special set as
+        hard stops would halt generation at the delimiter and truncate the tool
+        call before the parser ever sees it. Whitespace-only tokens are dropped.
+        User-supplied request `stop` strings are handled separately and are not
+        affected by this set.
+
+        Without an HF tokenizer, returns _DEFAULT_SPECIAL_TOKENS. With one, may return
+        [] (no eos_token, no known terminator registered as special); end-of-turn
+        detection then relies entirely on the worker's EOS-by-token-id check (e.g.
+        the Qwen engine adds <|im_end|> to eos_ids), so this set is only a backstop.
+        """
+        if self._hf is None:
+            return list(_DEFAULT_SPECIAL_TOKENS)
+        specials = {
+            t
+            for t in (getattr(self._hf, "all_special_tokens", []) or [])
+            if isinstance(t, str) and t.strip()
+        }
+        out: list[str] = []
+        eos = getattr(self._hf, "eos_token", None)
+        if isinstance(eos, str) and eos.strip():
+            out.append(eos)
+        for t in _TURN_TERMINATORS:
+            if t in specials and t not in out:
+                out.append(t)
+        return out
 
-        From the HF tokenizer when available (model-accurate), else a default set
-        covering common chat models.
+    def special_tokens(self) -> list[str]:
+        """ALL special-token strings, for final content cleanup -- stripping any
+        special token that leaked into visible output. Deliberately broad, and
+        distinct from turn_stop_sequences(): this set must NOT be used as
+        generation stops or pre-parse truncation (that would halt/cut a tool call
+        at a structural delimiter), only to scrub trailing specials from the
+        already-parsed visible content. Whitespace-only tokens are dropped so a
+        stray '  ' token can't truncate content at the first double space.
         """
         if self._hf is not None:
             toks = list(getattr(self._hf, "all_special_tokens", []) or [])
-            return [t for t in toks if isinstance(t, str) and t]
+            return [t for t in toks if isinstance(t, str) and t.strip()]
         return list(_DEFAULT_SPECIAL_TOKENS)
 
     @staticmethod
     def _fallback(messages: list[ChatMessage]) -> str:
-        # Approximate ChatML. Provide --hf-tokenizer for model-correct formatting
-        # (including reasoning controls like enable_thinking, which the fallback
-        # cannot reproduce).
+        # Approximate ChatML, TEXT-ONLY. Provide --hf-tokenizer for model-correct
+        # formatting (reasoning controls like enable_thinking, and structured
+        # tool/multimodal turns, which this fallback cannot reproduce). This path
+        # renders only text content: assistant `tool_calls` and a tool-role
+        # `tool_call_id` are dropped, so it is NOT a correctness path for tool or
+        # multimodal conversations -- use a real --hf-tokenizer for those.
         parts = []
         for m in messages:
-            content = m.content if isinstance(m.content, str) else str(m.content or "")
+            content = _content_text(m.content)
             parts.append(f"<|im_start|>{m.role}\n{content}<|im_end|>")
         parts.append("<|im_start|>assistant\n")
         return "\n".join(parts)
diff --git a/extension/llm/server/python/serving_chat.py b/extension/llm/server/python/serving_chat.py
@@ -61,9 +61,16 @@ def __init__(
         # Detector CLASS; a fresh instance is created per request so streaming
         # state is never shared across concurrent requests.
         self._tool_detector_cls = tool_detector_cls
-        # Special tokens (e.g. <|im_end|>) the runner decodes to text; we cut the
-        # visible content at the first one so they don't leak into responses.
-        self._stops = template.special_tokens()
+        # Two distinct sets (see chat_template):
+        #  * _stops: NARROW turn terminators (e.g. <|im_end|>) used as generation
+        #    stops AND for pre-parse truncation (_options/_collect_until_stop/
+        #    _truncate_raw/_clean). Excludes structural/tool delimiters so a
+        #    <tool_call> is never halted or cut before _extract_tools sees it.
+        #  * _content_specials: BROAD all-special-tokens set, used ONLY by
+        #    _strip_specials for final cleanup of the already-parsed visible
+        #    content, so a stray special token can't leak to the user.
+        self._stops = template.turn_stop_sequences()
+        self._content_specials = template.special_tokens()
 
     @staticmethod
     def _tool_schemas(req: ChatCompletionRequest) -> dict[str, dict]:
@@ -80,7 +87,9 @@ def _tool_schemas(req: ChatCompletionRequest) -> dict[str, dict]:
         return schemas
 
     def _strip_specials(self, text: str) -> str:
-        cut = _earliest_stop(text, self._stops)
+        # Broad set: scrub ANY special token that leaked into already-parsed
+        # visible content (not the narrow generation-stop set).
+        cut = _earliest_stop(text, self._content_specials)
         return text[:cut] if cut is not None else text
 
     @staticmethod
diff --git a/extension/llm/server/python/tests/test_hermes_tool_parser.py b/extension/llm/server/python/tests/test_hermes_tool_parser.py
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Tests for HermesDetector (Hermes/Qwen JSON <tool_call> format).
+
+Covers the all-or-nothing policy: a malformed/truncated call degrades to the
+leading text with the <tool_call> markup stripped (no-markup-leak guarantee); a
+well-formed call to an undefined tool degrades to the full text, markup visible.
+"""
+
+import json
+
+from executorch.extension.llm.server.python.tool_parsers import HermesDetector
+
+_TOOLS = {
+    "get_weather": {"type": "object", "properties": {"city": {"type": "string"}}},
+    "echo": {"type": "object", "properties": {"text": {"type": "string"}}},
+}
+
+
+def _parse(text, tools=_TOOLS):
+    return HermesDetector().detect_and_parse(text, tools)
+
+
+def test_basic_call():
+    text = (
+        '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>'
+    )
+    r = _parse(text)
+    assert len(r.calls) == 1 and r.calls[0].name == "get_weather"
+    assert json.loads(r.calls[0].arguments) == {"city": "Paris"}
+
+
+def test_multiple_calls_still_parse():
+    text = (
+        '<tool_call>{"name": "echo", "arguments": {"text": "a"}}</tool_call>'
+        '<tool_call>{"name": "echo", "arguments": {"text": "b"}}</tool_call>'
+    )
+    r = _parse(text)
+    assert [json.loads(c.arguments)["text"] for c in r.calls] == ["a", "b"]
+
+
+def test_no_tool_call_is_passthrough():
+    r = _parse("just some text")
+    assert not r.calls and r.normal_text == "just some text"
+
+
+def test_malformed_block_with_valid_sibling_degrades_no_leak():
+    # All-or-nothing: one malformed block degrades the WHOLE response (the valid
+    # sibling is NOT emitted in isolation), and no <tool_call> markup leaks.
+    text = (
+        'lead<tool_call>{"name": "echo", "arguments": {"text": "ok"}}</tool_call>'
+        "<tool_call>{bad json}</tool_call>"
+    )
+    r = _parse(text)
+    assert not r.calls
+    assert "<tool_call>" not in r.normal_text
+    assert r.normal_text == "lead"
+
+
+def test_unclosed_marker_degrades_no_leak():
+    text = 'lead<tool_call>{"name": "echo", "arguments": {"text": "x"}}'
+    r = _parse(text)
+    assert not r.calls
+    assert "<tool_call>" not in r.normal_text
+    assert r.normal_text == "lead"
+
+
+def test_string_value_containing_close_marker_not_truncated():
+    # A JSON string value containing literal </tool_call> must not truncate the
+    # captured JSON (raw_decode parses the whole object regardless).
+    text = (
+        '<tool_call>{"name": "echo", "arguments": '
+        '{"text": "a </tool_call> b"}}</tool_call>'
+    )
+    r = _parse(text)
+    assert len(r.calls) == 1
+    assert json.loads(r.calls[0].arguments) == {"text": "a </tool_call> b"}
+
+
+def test_arguments_null_falls_back_to_parameters():
+    text = (
+        '<tool_call>{"name": "echo", "arguments": null, '
+        '"parameters": {"text": "p"}}</tool_call>'
+    )
+    r = _parse(text)
+    assert json.loads(r.calls[0].arguments) == {"text": "p"}
+
+
+def test_undefined_tool_degrades_to_full_text():
+    # A WELL-FORMED call to an undefined tool degrades the whole response to
+    # visible text (unchanged policy: surface the model's intent, never a partial
+    # set). This differs from the malformed/truncated case, which strips markup.
+    text = 'hi<tool_call>{"name": "nope", "arguments": {}}</tool_call>'
+    r = _parse(text)
+    assert not r.calls
+    assert "<tool_call>" in r.normal_text  # full text, markup visible
diff --git a/extension/llm/server/python/tests/test_qwen_tool_parser.py b/extension/llm/server/python/tests/test_qwen_tool_parser.py
@@ -124,3 +124,95 @@ def test_untyped_param_falls_back_to_json_guess():
     )
     r = _parse(text, tools)
     assert json.loads(r.calls[0].arguments) == {"n": 42, "items": [1, 2]}
+
+
+_TYPED = {
+    "code_tool": {"type": "object", "properties": {"code": {"type": "string"}}},
+    "calc": {
+        "type": "object",
+        "properties": {
+            "n": {"type": "integer"},
+            "x": {"type": "number"},
+            "flag": {"type": "boolean"},
+        },
+    },
+}
+
+
+def test_param_value_with_literal_parameter_close():
+    # A value containing literal </parameter> must be preserved, not truncated.
+    text = "<function=code_tool><parameter=code>a </parameter> b</parameter></function>"
+    r = _parse(text, _TYPED)
+    assert json.loads(r.calls[0].arguments) == {"code": "a </parameter> b"}
+
+
+def test_param_value_with_function_markup():
+    # A value containing <function=...> markup must stay in the value, not split.
+    text = (
+        "<function=code_tool><parameter=code>x = <function=foo></parameter></function>"
+    )
+    r = _parse(text, _TYPED)
+    assert len(r.calls) == 1
+    assert json.loads(r.calls[0].arguments) == {"code": "x = <function=foo>"}
+
+
+def test_declared_integer_with_float_string_kept_raw():
+    text = "<function=calc><parameter=n>10.0</parameter></function>"
+    val = json.loads(_parse(text, _TYPED).calls[0].arguments)["n"]
+    assert val == "10.0" and isinstance(val, str)  # not float 10.0
+
+
+def test_declared_boolean_with_one_kept_raw():
+    text = "<function=calc><parameter=flag>1</parameter></function>"
+    val = json.loads(_parse(text, _TYPED).calls[0].arguments)["flag"]
+    assert val == "1" and isinstance(val, str)  # not int 1
+
+
+def test_declared_integer_with_underscores_kept_raw():
+    text = "<function=calc><parameter=n>1_000</parameter></function>"
+    val = json.loads(_parse(text, _TYPED).calls[0].arguments)["n"]
+    assert val == "1_000" and isinstance(val, str)  # not int 1000
+
+
+def _reject_bare_constant(c):
+    # json.loads parse_constant hook: fires only for bare NaN/Infinity/-Infinity.
+    raise AssertionError(f"emitted bare non-finite constant: {c}")
+
+
+def test_declared_number_non_finite_never_emitted():
+    for bad in ("NaN", "Infinity", "-Infinity", "1e999"):
+        text = f"<function=calc><parameter=x>{bad}</parameter></function>"
+        args = _parse(text, _TYPED).calls[0].arguments
+        # Strict-client safe: no bare NaN/Infinity constant in the emitted JSON.
+        json.loads(args, parse_constant=_reject_bare_constant)
+        assert json.loads(args)["x"] == bad  # kept as the raw string
+
+
+def test_multiple_valid_calls_still_parse():
+    text = (
+        "<function=add><parameter=a>1</parameter><parameter=b>2</parameter></function>"
+        "<function=add><parameter=a>3</parameter><parameter=b>4</parameter></function>"
+    )
+    r = _parse(text)
+    assert [json.loads(c.arguments) for c in r.calls] == [
+        {"a": 1, "b": 2},
+        {"a": 3, "b": 4},
+    ]
+
+
+def test_truncated_call_degrades_without_leaking_markup():
+    # A call cut off by max_tokens (no closing tags) must NOT leak the partial
+    # <function=...> markup -- only the leading text survives (mirrors Hermes).
+    text = "Sure! <function=get_weather><parameter=city>Paris"
+    r = _parse(text, _TYPED)
+    assert not r.calls
+    assert "<function=" not in r.normal_text
+    assert r.normal_text == "Sure!"
+
+
+def test_truncated_tool_call_wrapper_no_leak():
+    text = "ok <tool_call>\n<function=get_weather><parameter=city>Par"
+    r = _parse(text, _TYPED)
+    assert not r.calls
+    assert "<tool_call>" not in r.normal_text and "<function=" not in r.normal_text
+    assert r.normal_text == "ok"
diff --git a/extension/llm/server/python/tests/test_template.py b/extension/llm/server/python/tests/test_template.py
diff --git a/extension/llm/server/python/tool_parsers/hermes.py b/extension/llm/server/python/tool_parsers/hermes.py
diff --git a/extension/llm/server/python/tool_parsers/qwen.py b/extension/llm/server/python/tool_parsers/qwen.py