pytorch
diff --git a/Makefile b/Makefile
Lines changed: 5 additions & 15 deletions
diff --git a/backends/cuda/runtime/cuda_mutable_state.cpp b/backends/cuda/runtime/cuda_mutable_state.cpp
Lines changed: 2 additions & 0 deletions
diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt
Lines changed: 2 additions & 2 deletions
diff --git a/examples/models/qwen3_5_moe/CMakePresets.json b/examples/models/qwen3_5_moe/CMakePresets.json
Lines changed: 3 additions & 28 deletions
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
Lines changed: 6 additions & 4 deletions
diff --git a/examples/models/qwen3_5_moe/model.md b/examples/models/qwen3_5_moe/model.md
Lines changed: 3 additions & 2 deletions
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.h b/examples/models/qwen3_5_moe/qwen35_moe_engine.h
Lines changed: 4 additions & 3 deletions
diff --git a/examples/models/qwen3_5_moe/serve.py b/examples/models/qwen3_5_moe/serve.py
Lines changed: 4 additions & 4 deletions
diff --git a/extension/llm/server/README.md b/extension/llm/server/README.md
Lines changed: 4 additions & 3 deletions
diff --git a/extension/llm/server/python/README.md b/extension/llm/server/python/README.md
Lines changed: 17 additions & 14 deletions
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-cuda-serve qwen3_5_moe-metal clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -129,8 +129,7 @@ help:
 	@echo "  gemma3-cpu          - Build Gemma3 runner with CPU backend"
 	@echo "  gemma4_31b-cuda     - Build Gemma 4 31B runner with CUDA backend"
 	@echo "  gemma4_31b-mlx      - Build Gemma 4 31B runner with MLX backend"
-	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner with CUDA backend"
-	@echo "  qwen3_5_moe-cuda-serve - Build Qwen3.5 MoE runner + OpenAI serving worker (CUDA)"
+	@echo "  qwen3_5_moe-cuda    - Build Qwen3.5 MoE runner + OpenAI serving worker (CUDA)"
 	@echo "  qwen3_5_moe-metal   - Build Qwen3.5 MoE runner with Metal backend"
 	@echo "  clean               - Clean build artifacts"
 
@@ -432,11 +431,13 @@ voxtral_tts-cuda:
 qwen3_5_moe-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
 	cmake --workflow --preset llm-release-cuda
-	@echo "==> Building Qwen3.5 MoE runner with CUDA..."
+	@echo "==> Building Qwen3.5 MoE runner + serving worker with CUDA..."
 	cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda
 	@echo ""
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
+	@echo "  Serving worker: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
+	@echo "  Launch: see examples/models/qwen3_5_moe/README.md (Serving)"
 
 gemma4_31b-cuda:
 	@echo "==> Building and installing ExecuTorch with CUDA..."
@@ -456,17 +457,6 @@ gemma4_31b-mlx:
 	@echo "✓ Build complete!"
 	@echo "  Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
 
-qwen3_5_moe-cuda-serve:
-	@echo "==> Building and installing ExecuTorch with CUDA..."
-	cmake --workflow --preset llm-release-cuda
-	@echo "==> Building Qwen3.5 MoE runner + serving worker with CUDA..."
-	cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda-serve
-	@echo ""
-	@echo "✓ Build complete!"
-	@echo "  Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
-	@echo "  Serving worker: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
-	@echo "  Launch: see examples/models/qwen3_5_moe/README.md (Serving)"
-
 qwen3_5_moe-metal:
 	@echo "==> Building and installing ExecuTorch with Metal..."
 	cmake --workflow --preset llm-release-metal
 
@@ -163,6 +163,7 @@ Error build_descriptors(Context& c, CudaDelegateHandle* h) {
               tpl, t->data_ptr(), t->nbytes(), cudaMemcpyDeviceToDevice) !=
           cudaSuccess) {
         ET_LOG(Error, "mutable_state: cudaMemcpy template '%s'", fqn.c_str());
+        cudaFree(tpl);
         return Error::Internal;
       }
       c.template_ptr[fqn] = tpl;
@@ -194,6 +195,7 @@ Error ensure_session_buffers(Context& c, int token) {
     if (cudaMemcpy(p, tpl, nbytes, cudaMemcpyDeviceToDevice) != cudaSuccess) {
       ET_LOG(
           Error, "mutable_state: cudaMemcpy session buffer '%s'", fqn.c_str());
+      cudaFree(p);
       return Error::Internal;
     }
     buf[fqn] = p;
 
@@ -78,8 +78,8 @@ endif()
 
 # Process-isolated serving worker (qwen3_5_moe_worker): constructs
 # Qwen35MoEEngine directly and speaks the JSONL worker protocol that the Python
-# control plane drives via WorkerClient (no pybind, no Python model code). Used
-# by the qwen3_5_moe-cuda-serve flow.
+# control plane drives via WorkerClient (no pybind, no Python model code). Built
+# alongside the runner by the qwen3-5-moe-cuda preset.
 add_executable(qwen3_5_moe_worker qwen35_moe_worker.cpp qwen35_moe_engine.cpp)
 target_include_directories(
   qwen3_5_moe_worker PUBLIC ${_common_include_directories} ${_json_include}
 
@@ -13,7 +13,7 @@
         },
         {
             "name": "qwen3-5-moe-cuda",
-            "displayName": "Qwen3.5 MoE runner (CUDA)",
+            "displayName": "Qwen3.5 MoE runner + serving worker (CUDA)",
             "inherits": ["qwen3-5-moe-base"],
             "cacheVariables": {
                 "EXECUTORCH_BUILD_CUDA": "ON"
@@ -24,11 +24,6 @@
                 "list": ["Linux", "Windows"]
             }
         },
-        {
-            "name": "qwen3-5-moe-cuda-serve",
-            "displayName": "Qwen3.5 MoE runner + serving worker (CUDA)",
-            "inherits": ["qwen3-5-moe-cuda"]
-        },
         {
             "name": "qwen3-5-moe-metal",
             "displayName": "Qwen3.5 MoE runner (Metal)",
@@ -46,14 +41,8 @@
     "buildPresets": [
         {
             "name": "qwen3-5-moe-cuda",
-            "displayName": "Build Qwen3.5 MoE runner (CUDA)",
-            "configurePreset": "qwen3-5-moe-cuda",
-            "targets": ["qwen3_5_moe_runner"]
-        },
-        {
-            "name": "qwen3-5-moe-cuda-serve",
             "displayName": "Build Qwen3.5 MoE runner + serving worker (CUDA)",
-            "configurePreset": "qwen3-5-moe-cuda-serve",
+            "configurePreset": "qwen3-5-moe-cuda",
             "targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"]
         },
         {
@@ -66,7 +55,7 @@
     "workflowPresets": [
         {
             "name": "qwen3-5-moe-cuda",
-            "displayName": "Configure and build Qwen3.5 MoE runner (CUDA)",
+            "displayName": "Configure and build Qwen3.5 MoE runner + serving worker (CUDA)",
             "steps": [
                 {
                     "type": "configure",
@@ -78,20 +67,6 @@
                 }
             ]
         },
-        {
-            "name": "qwen3-5-moe-cuda-serve",
-            "displayName": "Configure and build Qwen3.5 MoE runner + serving worker (CUDA)",
-            "steps": [
-                {
-                    "type": "configure",
-                    "name": "qwen3-5-moe-cuda-serve"
-                },
-                {
-                    "type": "build",
-                    "name": "qwen3-5-moe-cuda-serve"
-                }
-            ]
-        },
         {
             "name": "qwen3-5-moe-metal",
             "displayName": "Configure and build Qwen3.5 MoE runner (Metal)",
 
@@ -100,14 +100,16 @@ It can be uploaded to HuggingFace Hub for easy sharing.
 
 ExecuTorch must be installed from source first (see
 [Prerequisites](#prerequisites)). The `make` target handles building
-core libraries and the runner binary.
+core libraries and the binaries.
 
 ```bash
 make qwen3_5_moe-cuda
 ```
 
 This builds ExecuTorch with CUDA backend support, then the runner binary
-at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner`.
+at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner` and the
+serving worker at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker`
+(see [Serving](#serving-openai-compatible)).
 
 ## Run
 
@@ -144,10 +146,10 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
 Run an OpenAI-compatible HTTP server so an agent harness (pi, opencode, …) can
 use the model for local tool-use. Point your client at `http://<host>:<port>/v1`.
 
-Build the runner **and** the serving worker:
+The CUDA build produces the runner **and** the serving worker:
 
 ```bash
-make qwen3_5_moe-cuda-serve
+make qwen3_5_moe-cuda
 ```
 
 Launch (the `LD_LIBRARY_PATH` shim is forwarded to the worker for the CUDA blob):
 
@@ -151,8 +151,9 @@ any harness) drive the model without knowing it is Qwen-MoE or CUDA.
   that shares that one model but owns its own per-session mutable state
   (KV/conv/recurrent), rebound before execute under the engine lock.
   `serving_capacity()` reports how many such sessions fit without duplicating
-  weights (or 1 if the backend can't rebind). The serving path is still
-  single-slot until the worker exposes multi-session.
+  weights (or 1 if the backend can't rebind). The serving worker exposes this
+  over its protocol: requests are routed by `session_id` and warm-resume their
+  context across turns, with execution serialized to one in-flight request.
 - **`Qwen35MoESession`** owns the mutable conversation state (KV / conv /
   recurrent arenas via the Module, position cursor, pending token).
   `prefill_tokens` dispatches to `prefill` (T≥2) or `decode` (T==1);
 
@@ -25,9 +25,10 @@
 // rebind. The per-session rebind machinery is CUDA-backend-private (see
 // backends/cuda/runtime/cuda_mutable_state).
 //
-// The SERVING path (qwen3_5_moe_worker + control plane) is still single-slot:
-// it creates one session and queues requests on it. Exposing the engine's
-// multi-session capability over the worker protocol is a follow-up.
+// The SERVING path (qwen3_5_moe_worker + control plane) exposes this over the
+// worker protocol: the worker routes requests to per-session_id state (V2a) and
+// reuses each session's resident context across requests (warm append-only
+// resume, V2b.1). Execution stays serialized (one in-flight request).
 
 #pragma once
 
 
@@ -40,8 +40,8 @@
 from pathlib import Path
 
 from executorch.extension.llm.server.python.chat_template import ChatTemplate
-from executorch.extension.llm.server.python.runner_pool import RunnerPool
 from executorch.extension.llm.server.python.serving_chat import ServingChat
+from executorch.extension.llm.server.python.session_runtime import SessionRuntime
 from executorch.extension.llm.server.python.tool_parsers import QwenFunctionCallDetector
 from executorch.extension.llm.server.python.worker_client import spawn_worker
 
@@ -89,9 +89,9 @@ def build_app_from_args(args):
     )
 
-    worker = _spawn(args)  # one worker == one session (single-slot V1)
+    worker = _spawn(args)  # one worker process hosting many isolated sessions
-    pool = RunnerPool([worker])
+    runtime = SessionRuntime(worker)
     serving = ServingChat(
-        pool,
+        runtime,
         template,
         args.model_id,
         max_context=args.max_context,
@@ -105,7 +105,7 @@ def build_app_from_args(args):
 
     @app.on_event("shutdown")
     def _stop_worker():
-        pool.close()
+        runtime.close_worker()
 
     return app, args.model_id
 
 
@@ -25,9 +25,10 @@ Hugging Face chat templates (`--hf-tokenizer`), `temperature` / `max_tokens` /
 `max_completion_tokens` / `stop`, Hermes tool calling by default
 (`<tool_call>...</tool_call>` JSON, complete calls only; model-specific launchers
 may select the Qwen XML format) with `tool_choice="none"`,
-structured API errors, and best-effort cancellation. V1 serving is single-slot
-(one worker, one session) with no prefix cache; KV prefix reuse, if it returns,
-lives inside the worker/session, not the control plane. Unsupported params (including `top_p`,
+structured API errors, and best-effort cancellation. Serving runs one worker
+process with serialized execution; the worker hosts many isolated sessions on
+one weight load (warm append-only resume across turns). KV/prefix state lives
+inside the worker/session, not the control plane. Unsupported params (including `top_p`,
 `seed`, `n>1`, `reasoning_effort`, penalties, `logit_bias`, `response_format`,
 `logprobs`, and `tool_choice="required"`) are rejected with a structured 400
 rather than silently ignored. See `python/README.md` to run it and
 
@@ -69,7 +69,7 @@ Key flags:
 | `--allow-chatml-fallback` | opt into approximate ChatML when no HF tokenizer |
 | `--no-think` | default `enable_thinking=False` (e.g. Qwen3) |
 | `--max-context N` | reject over-long prompts with 400 instead of failing mid-gen |
-| `--num-runners N` | V1 supports **1 only** (single-slot: one worker serves one session; concurrent requests queue) |
+| `--num-runners N` | Worker processes — **1 only** (one worker hosts many isolated sessions on one weight load; more would duplicate weights) |
 | `--worker-bin PATH` | path to the `text_llm_worker` binary (default: `cmake-out/extension/llm/server/cpp/text_llm_worker`) |
 
 ## Use from an agent harness
@@ -101,16 +101,19 @@ pytest tests/
 OPENAI_BASE_URL=http://127.0.0.1:8000/v1 pytest ../conformance/test_openai_contract.py
 ```
 
-`tests/` builds a `RunnerPool` over a single `FakeRunner` worker handle, so the
+`tests/` builds a `SessionRuntime` over a single `FakeRunner` worker, so the
 real server/protocol/streaming code is tested over HTTP without a `.pte`. The
 worker JSONL protocol is covered separately by `tests/test_worker_client.py`.
 
 ## Architecture
 
-Control plane (this dir, Python): server, OpenAI protocol, chat templating,
-streaming bridge, tool parsing — no CUDA, no model, no pybind. Data plane (C++):
-a worker process (`text_llm_worker`) owns one model session and does all token
-stepping and KV mutation; it speaks one JSON object per line on stdin/stdout.
+Control plane (this dir, Python): an OpenAI adapter (`serving_chat`) over a
+stateful `SessionRuntime` over one `WorkerClient` — server, protocol, chat
+templating, streaming bridge, tool parsing — no CUDA, no model, no pybind. Data
+plane (C++): a worker process (`text_llm_worker`) that owns all model state
+(many isolated sessions on one weight load, warm-resume prefix logic) and does
+all token stepping and KV mutation; it speaks one JSON object per line on
+stdin/stdout.
 
 JSONL protocol (stdout carries protocol JSON only; logs go to stderr):
 
@@ -132,9 +135,9 @@ does blocking pipe I/O on its executor thread.
 | `server.py` | FastAPI app, routes, CLI entrypoint, worker spawn |
 | `protocol.py` | OpenAI request/response schemas |
 | `chat_template.py` | messages (+tools) → prompt string |
-| `worker_client.py` | spawn a worker process + drive it over JSONL |
-| `runner_pool.py` | worker pool (one in-flight request per worker) + async streaming bridge |
-| `serving_chat.py` | `/v1/chat/completions` (streaming + non-streaming, stop, tools) |
+| `worker_client.py` | spawn a worker process + drive it over JSONL (raw transport) |
+| `session_runtime.py` | stateful runtime over one worker: open/generate/reset/close + streaming bridge |
+| `serving_chat.py` | `/v1/chat/completions` OpenAI adapter (streaming + non-streaming, stop, tools) |
 | `tool_parsers/` | Hermes/Qwen `<tool_call>` parser only |
 | `cpp/text_llm_worker.cpp` | the generic C++ worker binary |
 
@@ -151,11 +154,11 @@ imports an example. Backend specifics (CUDA/AOTI, Metal) stay inside the worker.
 ## Scope & caveats
 
 Deliberately narrow (reliability-first): Hermes/Qwen tool calling only;
-unsupported sampling params are rejected, not ignored. V1 is **single-slot**: one
-worker hosts one session, so `--num-runners` accepts 1 and concurrent requests
-queue. Serving capacity is worker capacity, chosen by the launcher (each worker
-is its own process with its own weights, so N workers cost N × the weight memory)
-— an operator decision, not something the pool infers.
+unsupported sampling params are rejected, not ignored. **One worker process,
+serialized execution** (one in-flight request; concurrent requests queue).
+Session capacity is determined by the worker/engine — a single worker hosts many
+isolated sessions on one weight load — so `--num-runners` accepts only 1; extra worker
+processes would each carry their own copy of the weights.
 
 Cancellation is best-effort: a worker request runs to completion and is not
 interruptible mid-generation in V1, so `runner.stop()` means "the control plane