pytorch
diff --git a/‎backends/cuda/runtime/cuda_mutable_state.cpp‎
Lines changed: 1 addition & 1 deletion b/‎backends/cuda/runtime/cuda_mutable_state.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/models/qwen3_5_moe/README.md‎
Lines changed: 15 additions & 16 deletions b/‎examples/models/qwen3_5_moe/README.md‎
Lines changed: 15 additions & 16 deletions
diff --git a/‎examples/models/qwen3_5_moe/qwen35_moe_engine.cpp‎
Lines changed: 4 additions & 4 deletions b/‎examples/models/qwen3_5_moe/qwen35_moe_engine.cpp‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎examples/models/qwen3_5_moe/qwen35_moe_engine.h‎
Lines changed: 5 additions & 5 deletions b/‎examples/models/qwen3_5_moe/qwen35_moe_engine.h‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎examples/models/qwen3_5_moe/qwen35_moe_worker.cpp‎
Lines changed: 2 additions & 3 deletions b/‎examples/models/qwen3_5_moe/qwen35_moe_worker.cpp‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎examples/models/qwen3_5_moe/serve.py‎
Lines changed: 3 additions & 5 deletions b/‎examples/models/qwen3_5_moe/serve.py‎
Lines changed: 3 additions & 5 deletions
diff --git a/‎examples/models/qwen3_5_moe/test_qwen35_moe_nobleed.cpp‎
Lines changed: 2 additions & 2 deletions b/‎examples/models/qwen3_5_moe/test_qwen35_moe_nobleed.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎extension/llm/server/README.md‎
Lines changed: 9 additions & 3 deletions b/‎extension/llm/server/README.md‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎extension/llm/server/cpp/worker_loop.h‎
Lines changed: 50 additions & 59 deletions b/‎extension/llm/server/cpp/worker_loop.h‎
Lines changed: 50 additions & 59 deletions
@@ -398,7 +398,7 @@ void mutable_state_set_active(MutableStateContext ctx, int token) {
 void mutable_state_note_handle(CudaDelegateHandle* handle) {
   MutableStateContext ctx = tl_loading_ctx;
   if (ctx == kInvalidMutableContext) {
-    return; // not loading within a managed context (e.g. non-V2 path)
+    return; // not loading within a managed context (single-session path)
   }
   auto& m = mgr();
   std::lock_guard<std::mutex> g(m.mu);
 
@@ -161,9 +161,12 @@ LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH \
     --data-path   qwen35_moe_exports/aoti_cuda_blob.ptd \
     --tokenizer-path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
     --hf-tokenizer   ~/models/Qwen3.5-35B-A3B \
-    --model-id qwen3.5-moe --no-think
+    --model-id qwen3.5-moe --no-think --max-sessions 4
 ```
 
+`--max-sessions >= 2` is required for named sessions and warm resume; the default
+`1` is scratch-only (one slot is reserved for anonymous requests).
+
 ### Architecture (process isolation)
 
 Two processes, one model load:
@@ -202,16 +205,16 @@ is safe under asyncio.
 ### Sessions
 
 One worker loads the weights once (~18 GB) and hosts multiple **isolated**
-sessions on that single allocation — each with its own KV/recurrent state, via
-CUDA per-session mutable rebinding. Set `--max-sessions N` (clamped to 1 if the
-backend cannot rebind); one slot is reserved for anonymous requests, so up to
-`N - 1` named `session_id`s are addressable.
+sessions on that single allocation, each with its own KV/recurrent state. Set
+`--max-sessions N` (clamped to 1 if the backend hosts a single session); one slot
+is reserved for anonymous requests, so up to `N - 1` named `session_id`s are
+addressable.
 
 Route a request to a persistent session with the `session_id` body field or, as
 aliases, the `X-ExecuTorch-Session-ID` / `session_id` / `x-session-affinity`
 headers (body wins, then that header order). The header aliases let a client that
-already emits a stable per-conversation affinity id (e.g. pi's
-`sendSessionAffinityHeaders`) route with no extra config. Requests without any
+emits a stable per-conversation affinity id route per conversation (for pi, set
+`compat.sendSessionAffinityHeaders: true` in models.json). Requests without any
 share a transient scratch session.
 
 ```bash
@@ -243,15 +246,11 @@ Each `done` event reports
 (`new`/`exact_prefix`/`dirty`/`mismatch`/`equal`) for measuring the hit rate.
 `--no-warm-resume` forces a full prefill every request (for A/B comparison).
 
-**Tool-call turns (token-ID continuation):** an assistant turn re-rendered from
-its parsed tool call rarely re-tokenizes to the tokens the model actually
-generated, so plain warm resume misses on agent loops. The server stores the
-exact generated token ids per session and, on the next turn, sends the prompt as
-segments (`{"text"}` / `{"ids"}`) that splice those ids back in for prior
-assistant turns instead of re-rendering them — so the resident state stays an
-exact token prefix and resume hits. Tool *results* remain text (re-tokenized
-deterministically). The worker's exact-token check still backstops everything, so
-a mismatch just falls back to a full prefill.
+**Tool-call turns** also warm-resume: an assistant turn re-rendered from its
+parsed tool call rarely re-tokenizes to the tokens the model generated, so the
+server replays the exact generated token ids for prior turns to keep the resident
+state an exact prefix (tool *results* stay text). A mismatch still falls back to a
+full prefill.
 
 This is **isolation + warm resume, not concurrency**: execution is still
 synchronous (one in-flight request; `--num-runners > 1` is rejected since more
 
@@ -95,8 +95,8 @@ Result<std::unique_ptr<Module>> build_qwen_module(
 
 #ifdef EXECUTORCH_BUILD_CUDA
   // Backend options are read during backend init(), so they must be set before
-  // load_method. (CUDA graph is intentionally not enabled: V2 rebinds each
-  // session's mutable buffers before execute, which a captured graph's baked
+  // load_method. (CUDA graph is intentionally not enabled: each session
+  // rebinds its mutable buffers before execute, which a captured graph's baked
   // pointers would ignore.)
   {
     // Cross-method per-FQN weight sharing: prefill and decode reuse one weight
@@ -124,7 +124,7 @@ Error register_mutable_fqns(Module* module, int mutable_ctx) {
     ET_LOG(
         Error,
         "Qwen35MoEEngine: model has no get_mutable_buffer_metadata; re-export "
-        "for V2 multi-session");
+        "for multi-session");
     return res.error();
   }
   const auto& outs = res.get();
@@ -368,7 +368,7 @@ class Qwen35MoESession : public LLMSession {
   Error seek(int64_t pos) override {
     // The hybrid model carries recurrent/conv state that cannot be safely
     // rewound by logical position the way contiguous KV can. Fail closed so the
-    // prefix cache falls back to reset + full prefill (V1).
+    // prefix cache falls back to reset + full prefill.
     (void)pos;
     return Error::NotSupported;
   }
 
@@ -17,7 +17,7 @@
 // isolated points are where an MLX runtime would slot in. MLX is NOT
 // implemented or validated here.
 //
-// V2 (CUDA): the ENGINE is multi-session — one shared Module (weights loaded
+// CUDA: the ENGINE is multi-session — one shared Module (weights loaded
 // once); create_session() hands out multiple logical sessions, each rebinding
 // its own GPU buffers for the model's mutable state (KV/conv/recurrent) before
 // execute, serialized by the engine lock. serving_capacity() reports how many
@@ -26,9 +26,9 @@
 // backends/cuda/runtime/cuda_mutable_state).
 //
 // The SERVING path (qwen3_5_moe_worker + control plane) exposes this over the
-// worker protocol: the worker routes requests to per-session_id state (V2a) and
+// worker protocol: the worker routes requests to per-session_id state and
 // reuses each session's resident context across requests (warm append-only
-// resume, V2b.1). Execution stays serialized (one in-flight request).
+// resume). Execution stays serialized (one in-flight request).
 
 #pragma once
 
@@ -53,7 +53,7 @@ struct Qwen35MoEConfig {
   std::string model_path; // .pte
   std::string data_path; // .ptd (CUDA delegate blob); empty if none
   std::string tokenizer_path; // HuggingFace tokenizer.json
-  // V2 multi-session: max physical sessions to advertise when the backend can
+  // Multi-session: max physical sessions to advertise when the backend can
   // host them without weight duplication (CUDA per-session mutable rebinding).
   // Clamped to 1 if the backend cannot rebind.
   int32_t max_sessions = 1;
@@ -74,7 +74,7 @@ class ET_EXPERIMENTAL Qwen35MoEEngine : public LLMEngine {
   ::executorch::runtime::Result<std::unique_ptr<LLMSession>> create_session()
       override;
 
-  // CUDA V2: one shared Module (one weight allocation); each session rebinds
+  // CUDA: one shared Module (one weight allocation); each session rebinds
   // its own GPU buffers for the model's mutable state. Reports
   // config.max_sessions when the backend supports per-session rebinding, else
   // fails closed to 1.
 
@@ -14,9 +14,8 @@
 // protocol and decode loop every worker uses (worker_loop.h); this file only
 // constructs the engine/session.
 //
-// Isolation rationale: executing the AOTI CUDA model inside a live asyncio HTTP
-// process segfaults in the int4 matmul (validated). Here the model runs in a
-// plain synchronous loop in its own process, which is reliable.
+// Model execution is isolated in this C++ worker for CUDA/AOTI reliability (see
+// the example README for the full rationale).
 //
 // Multi-session: the engine loads weights once and hosts multiple isolated
 // sessions on that one ~18GB allocation; the shared worker loop (worker_loop.h)
 
@@ -12,10 +12,8 @@
 process (qwen3_5_moe_worker) that this process drives over JSONL via the generic
 WorkerClient — the same protocol the generic text_llm_worker speaks.
 
-Why two processes: executing the AOTI CUDA model inside a live asyncio server
-process segfaults in the int4 matmul (validated by elimination — the trigger is
-CUDA execution while a live asyncio loop is resident). Isolating CUDA in a plain
-(no-asyncio) C++ worker process is the reliable shape, and it loads weights once.
+Model execution is isolated in the C++ worker for CUDA/AOTI reliability; the
+worker loads weights once. (See the example README for the full rationale.)
 
 Sessions and constraints:
   * One worker hosts many isolated sessions on a single ~18GB weight load (CUDA
@@ -121,7 +119,7 @@ def _stop_worker():
 
 def main() -> None:
     p = argparse.ArgumentParser(
-        description="OpenAI-compatible LLM server for Qwen3.5 MoE (process-isolated, V1)"
+        description="OpenAI-compatible LLM server for Qwen3.5 MoE (process-isolated)"
     )
     p.add_argument("--model-path", required=True, help="Path to the .pte model")
     p.add_argument(
 
@@ -6,11 +6,11 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// GPU no-bleed integration proof for the CUDA V2 per-session mutable-state
+// GPU no-bleed integration proof for the CUDA per-session mutable-state
 // rebind -- the REAL guard for mutable-buffer completeness (an under-declared
 // buffer would be shared across sessions; only behavior catches that, not the
 // declared-subset-of-discovered bookkeeping check). This is the automated form
-// of the manual "A solo / A inter" proof in the V2 foundation commit.
+// of the manual "A solo / A inter" multi-session isolation proof.
 //
 // CRITICAL: sessions are interleaved at EXECUTE granularity (A prefill, B
 // prefill, A decode, B decode, ...). The mechanism under test is the
 
@@ -11,6 +11,11 @@ extension/llm/server/
   # cpp/         # future: no-Python single-binary server
 ```
 
+**Which entry point:** use `extension.llm.server.python.server` for generic
+TextLLM `.pte` models; use `examples.models.qwen3_5_moe.serve` for Qwen3.5-MoE
+CUDA (it needs the `.ptd` delegate blob, Qwen XML tool parsing, and the Qwen
+engine/session worker).
+
 Why this layout: the OpenAI contract is identical across languages, so the
 **spec** and **conformance** suite are shared, and each language gets its own
 implementation directory. The real cross-language reuse comes from the C++
@@ -26,8 +31,8 @@ Hugging Face chat templates (`--hf-tokenizer`), `temperature` / `max_tokens` /
 (`<tool_call>...</tool_call>` JSON, complete calls only; model-specific launchers
 may select the Qwen XML format) with `tool_choice="none"`,
 structured API errors, and best-effort cancellation. One worker process with
-serialized execution; it hosts many isolated sessions on one weight load (warm
-append-only resume across turns). KV/prefix state lives inside the
+serialized execution; a worker can host isolated sessions on one weight load when its engine reports
+capacity > 1 (with warm append-only resume across turns). KV/prefix state lives inside the
 worker/session, not the control plane. Unsupported params (including `top_p`,
 `seed`, `n>1`, `reasoning_effort`, penalties, `logit_bias`, `response_format`,
 `logprobs`, and `tool_choice="required"`) are rejected with a structured 400
@@ -63,7 +68,8 @@ Point pi at the server via `~/.pi/agent/models.json`:
 ```json
 { "providers": { "executorch": {
     "baseUrl": "http://127.0.0.1:8000/v1", "api": "openai-completions",
-    "apiKey": "x", "models": [ { "id": "<model-id>" } ] } } }
+    "apiKey": "x", "models": [ { "id": "<model-id>",
+      "compat": { "sendSessionAffinityHeaders": true } } ] } } }
 ```
 
 Other OpenAI-compatible clients use their own schema — generically: base URL
 
@@ -8,67 +8,47 @@
 
 #pragma once
 
-// Shared model-worker generation loop + JSONL protocol, used by every model
-// worker (the generic text_llm_worker and model-specific workers like
-// qwen3_5_moe_worker). A worker only constructs its engine/tokenizer and calls
-// run_worker_stdio_loop(); the protocol, session management, and the decode
-// loop live here once, so protocol changes land in a single place.
+// Shared model-worker generation loop + JSONL protocol for every model worker
+// (the generic text_llm_worker and model-specific workers like
+// qwen3_5_moe_worker): a worker constructs its engine + tokenizer and calls
+// run_worker_stdio_loop(); the protocol, session routing, and decode loop live
+// here once.
 //
-// V2a (isolation): the worker owns one LLMEngine (weights loaded once) and
-// hands out multiple isolated LLMSessions keyed by session_id, each with its
-// own KV/recurrent state, up to the engine's serving capacity. Execution is
-// synchronous -- one in-flight request at a time, the control plane serializes.
+// The worker owns one LLMEngine (weights loaded once) and serves multiple
+// isolated LLMSessions keyed by session_id, up to the engine's serving
+// capacity; anonymous requests (no session_id) share one scratch session that
+// is reset every request. Execution is synchronous: one in-flight request at a
+// time.
 //
-// V2b.1 (warm append-only resume): a named session keeps its decoded context
-// across requests. On the next request the worker compares the new prompt's
-// token ids against the session's resident token ids; if the resident ids are
-// an exact prefix, it prefills ONLY the suffix (continuing the KV/recurrent
-// state at pos>0) instead of resetting and re-prefilling the whole prompt. The
-// check is exact-token (never string/retokenized text) and falls back to a full
-// reset+prefill whenever exact reuse can't be proven, so it is always correct;
-// the win is when the prompt is a genuine token extension of the prior turn.
+// Warm resume: a named session keeps its decoded context across requests. The
+// new prompt's token ids are matched against the session's resident token ids;
+// on an exact prefix only the suffix is prefilled (continuing at pos>0). The
+// match is exact-token (never retokenized text) and falls back to a full
+// reset+prefill whenever exact reuse can't be proven, so it is always correct.
 // See plan_prefill().
 //
-// Sessions:
-//   - Named: an explicit session_id -> session + resident token ids, created on
-//     first use (or via an `open` op), capped at max_named_sessions = capacity
-//     - 1 (the scratch slot is reserved). 0 when the backend hosts one session.
-//     Warm resume applies to named sessions (unless disabled).
-//   - Scratch: one session for anonymous requests (no session_id), reset every
-//     request -- distinct anonymous callers must never reuse each other's
-//     state.
-//
-// Protocol (one JSON object per line; matches worker_client.py):
+// Protocol (one JSON object per line; matches worker_client.py). stdout carries
+// ONLY protocol JSON; logs go to stderr (ET_LOG):
 //   worker -> stdout, once:    {"ready": true, "max_sessions": int,
 //                               "max_named_sessions": int}
 //   client -> stdin:
-//     generate:   {"max_new_tokens": int, "temperature": float,
-//                  "stop": [str, ...], "session_id"?: str,
-//                  and exactly one prompt form:
-//                    "prompt": str
-//                    "prompt_segments": [{"text": str} | {"ids": [int, ...]}]}
-//     open:       {"op": "open",  "session_id": str}
-//     close:      {"op": "close", "session_id": str}
-//     reset:      {"op": "reset", "session_id": str}  // clear context, keep
-//     slot
+//     generate: {"max_new_tokens": int, "temperature": float, "stop":
+//     [str,...],
+//                "session_id"?: str, and exactly one prompt form:
+//                  "prompt": str
+//                  "prompt_segments": [{"text": str} | {"ids": [int,...]}]}
+//     open/close/reset: {"op": "open"|"close"|"reset", "session_id": str}
 //   worker -> stdout:
-//     generate:   {"token": str} *   (streamed)
-//                 {"done": true, "prompt_tokens": int, "completion_tokens":
-//                 int,
-//                  "finish_reason": "stop"|"length",
-//                  "reused_prompt_tokens": int, "prefilled_prompt_tokens": int,
-//                  "session_reset_reason": "new"|"exact_prefix"|"dirty"|
-//                                          "mismatch"|"equal",
-//                  "generated_token_ids"?: [int, ...]}  // omitted if
-//                  stop-trimmed
-//     open:       {"opened": true, "session_id": str}
-//     close:      {"closed": true, "session_id": str}
-//     reset:      {"reset": true,  "session_id": str}
-//     error:      {"error": str, "code"?: str}  // code: "capacity_exhausted",
-//                                               // "unsupported_session"
-//
-// stdout carries ONLY protocol JSON; all logs go to stderr (ET_LOG). One
-// request at a time (the control plane serializes).
+//     generate: {"token": str} *  (streamed), then
+//               {"done": true, "prompt_tokens": int, "completion_tokens": int,
+//                "finish_reason": "stop"|"length",
+//                "reused_prompt_tokens": int, "prefilled_prompt_tokens": int,
+//                "session_reset_reason": str
+//                (new|exact_prefix|mismatch|dirty|equal),
+//                "generated_token_ids"?: [int,...]}  // omitted if stop-trimmed
+//     open/close/reset: {"opened"|"closed"|"reset": true, "session_id": str}
+//     error:    {"error": str, "code"?: str}  // capacity_exhausted |
+//                                              // unsupported_session
 
 #include <nlohmann/json.hpp>
 
@@ -242,7 +222,12 @@ inline void worker_handle_request(
     const auto& d = step_result.get();
     if (d.is_terminal) {
       finish = "stop";
-      break; // terminal step (EOS / cooperative stop): not emitted or counted
+      // Terminal step (EOS / cooperative stop): the terminal token is neither
+      // emitted as text nor counted in num_generated -> completion_tokens. This
+      // is intentional -- completion_tokens reflects the visible completion the
+      // client received, not internal forward steps; an EOS the user never sees
+      // is not part of that count.
+      break;
     }
     // The token was forwarded into the cache (pos advanced); track it so the
     // resident-ids/position invariant holds. EOS/terminal tokens are not
@@ -264,9 +249,15 @@ inline void worker_handle_request(
     if (stop_hit) {
       finish = "stop"; // reached a stop string: drop it and everything after
       stop_string = true;
-      // The emitted text was trimmed at the stop string, so the next turn's
-      // rendered prompt won't be an exact token extension of resident: force a
-      // reset rather than risk a false prefix match.
+      // Trimming at the stop means the next turn's prompt won't be an exact
+      // token extension of resident, so force a reset (no false prefix match).
+      //
+      // CONTRACT: every *string* stop is non-resumable this way (trim + dirty +
+      // omit generated_token_ids) -- right for user/request and content-cleanup
+      // stops, which change visible text. A clean turn terminator stays
+      // warm-resumable only if the engine surfaces it as a terminal/EOS token
+      // id (handled above via d.is_terminal; e.g. Qwen adds <|im_end|> to
+      // eos_ids).
       st.dirty = true;
       break;
     }
@@ -400,8 +391,8 @@ class WorkerSessions {
 // Emit {"ready": true, ...}, then read JSONL requests from stdin and dispatch
 // each (generate / open / close / reset), reporting exceptions as
 // {"error": ...} and continuing to serve. Returns 0 when stdin closes.
-// enable_warm_resume gates V2b.1 warm suffix reuse for named sessions (off ->
-// every request resets, the V2a behavior; useful for A/B measurement).
+// enable_warm_resume gates warm suffix reuse for named sessions (off -> every
+// request resets and re-prefills; useful for A/B measurement).
 inline int run_worker_stdio_loop(
     LLMEngine& engine,
     ::tokenizers::Tokenizer& tokenizer,
Original file line number	Diff line number	Diff line change
`@@ -398,7 +398,7 @@ void mutable_state_set_active(MutableStateContext ctx, int token) {`
`398`	`398`	`void mutable_state_note_handle(CudaDelegateHandle* handle) {`
`399`	`399`	`MutableStateContext ctx = tl_loading_ctx;`
`400`	`400`	`if (ctx == kInvalidMutableContext) {`
`401`		`- return; // not loading within a managed context (e.g. non-V2 path)`
	`401`	`+ return; // not loading within a managed context (single-session path)`
`402`	`402`	`}`
`403`	`403`	`auto& m = mgr();`
`404`	`404`	`std::lock_guard<std::mutex> g(m.mu);`