Skip to content

Commit a6f5d73

Browse files
committed
[UPDATE] Update
[ghstack-poisoned]
2 parents 394b0c1 + ab7c8bc commit a6f5d73

19 files changed

Lines changed: 690 additions & 453 deletions

Makefile

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
#
9292
# ==============================================================================
9393

94-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-cuda-serve qwen3_5_moe-metal clean help
94+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda gemma4_31b-mlx qwen3_5_moe-cuda qwen3_5_moe-metal clean help
9595

9696
help:
9797
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -129,8 +129,7 @@ help:
129129
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
130130
@echo " gemma4_31b-cuda - Build Gemma 4 31B runner with CUDA backend"
131131
@echo " gemma4_31b-mlx - Build Gemma 4 31B runner with MLX backend"
132-
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner with CUDA backend"
133-
@echo " qwen3_5_moe-cuda-serve - Build Qwen3.5 MoE runner + OpenAI serving worker (CUDA)"
132+
@echo " qwen3_5_moe-cuda - Build Qwen3.5 MoE runner + OpenAI serving worker (CUDA)"
134133
@echo " qwen3_5_moe-metal - Build Qwen3.5 MoE runner with Metal backend"
135134
@echo " clean - Clean build artifacts"
136135

@@ -432,11 +431,13 @@ voxtral_tts-cuda:
432431
qwen3_5_moe-cuda:
433432
@echo "==> Building and installing ExecuTorch with CUDA..."
434433
cmake --workflow --preset llm-release-cuda
435-
@echo "==> Building Qwen3.5 MoE runner with CUDA..."
434+
@echo "==> Building Qwen3.5 MoE runner + serving worker with CUDA..."
436435
cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda
437436
@echo ""
438437
@echo "✓ Build complete!"
439438
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
439+
@echo " Serving worker: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
440+
@echo " Launch: see examples/models/qwen3_5_moe/README.md (Serving)"
440441

441442
gemma4_31b-cuda:
442443
@echo "==> Building and installing ExecuTorch with CUDA..."
@@ -456,17 +457,6 @@ gemma4_31b-mlx:
456457
@echo "✓ Build complete!"
457458
@echo " Binary: cmake-out/examples/models/gemma4_31b/gemma4_31b_runner"
458459

459-
qwen3_5_moe-cuda-serve:
460-
@echo "==> Building and installing ExecuTorch with CUDA..."
461-
cmake --workflow --preset llm-release-cuda
462-
@echo "==> Building Qwen3.5 MoE runner + serving worker with CUDA..."
463-
cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda-serve
464-
@echo ""
465-
@echo "✓ Build complete!"
466-
@echo " Binary: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner"
467-
@echo " Serving worker: cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
468-
@echo " Launch: see examples/models/qwen3_5_moe/README.md (Serving)"
469-
470460
qwen3_5_moe-metal:
471461
@echo "==> Building and installing ExecuTorch with Metal..."
472462
cmake --workflow --preset llm-release-metal

backends/cuda/runtime/cuda_mutable_state.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ Error build_descriptors(Context& c, CudaDelegateHandle* h) {
163163
tpl, t->data_ptr(), t->nbytes(), cudaMemcpyDeviceToDevice) !=
164164
cudaSuccess) {
165165
ET_LOG(Error, "mutable_state: cudaMemcpy template '%s'", fqn.c_str());
166+
cudaFree(tpl);
166167
return Error::Internal;
167168
}
168169
c.template_ptr[fqn] = tpl;
@@ -194,6 +195,7 @@ Error ensure_session_buffers(Context& c, int token) {
194195
if (cudaMemcpy(p, tpl, nbytes, cudaMemcpyDeviceToDevice) != cudaSuccess) {
195196
ET_LOG(
196197
Error, "mutable_state: cudaMemcpy session buffer '%s'", fqn.c_str());
198+
cudaFree(p);
197199
return Error::Internal;
198200
}
199201
buf[fqn] = p;

examples/models/qwen3_5_moe/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,8 @@ endif()
7878

7979
# Process-isolated serving worker (qwen3_5_moe_worker): constructs
8080
# Qwen35MoEEngine directly and speaks the JSONL worker protocol that the Python
81-
# control plane drives via WorkerClient (no pybind, no Python model code). Used
82-
# by the qwen3_5_moe-cuda-serve flow.
81+
# control plane drives via WorkerClient (no pybind, no Python model code). Built
82+
# alongside the runner by the qwen3-5-moe-cuda preset.
8383
add_executable(qwen3_5_moe_worker qwen35_moe_worker.cpp qwen35_moe_engine.cpp)
8484
target_include_directories(
8585
qwen3_5_moe_worker PUBLIC ${_common_include_directories} ${_json_include}

examples/models/qwen3_5_moe/CMakePresets.json

Lines changed: 3 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
},
1414
{
1515
"name": "qwen3-5-moe-cuda",
16-
"displayName": "Qwen3.5 MoE runner (CUDA)",
16+
"displayName": "Qwen3.5 MoE runner + serving worker (CUDA)",
1717
"inherits": ["qwen3-5-moe-base"],
1818
"cacheVariables": {
1919
"EXECUTORCH_BUILD_CUDA": "ON"
@@ -24,11 +24,6 @@
2424
"list": ["Linux", "Windows"]
2525
}
2626
},
27-
{
28-
"name": "qwen3-5-moe-cuda-serve",
29-
"displayName": "Qwen3.5 MoE runner + serving worker (CUDA)",
30-
"inherits": ["qwen3-5-moe-cuda"]
31-
},
3227
{
3328
"name": "qwen3-5-moe-metal",
3429
"displayName": "Qwen3.5 MoE runner (Metal)",
@@ -46,14 +41,8 @@
4641
"buildPresets": [
4742
{
4843
"name": "qwen3-5-moe-cuda",
49-
"displayName": "Build Qwen3.5 MoE runner (CUDA)",
50-
"configurePreset": "qwen3-5-moe-cuda",
51-
"targets": ["qwen3_5_moe_runner"]
52-
},
53-
{
54-
"name": "qwen3-5-moe-cuda-serve",
5544
"displayName": "Build Qwen3.5 MoE runner + serving worker (CUDA)",
56-
"configurePreset": "qwen3-5-moe-cuda-serve",
45+
"configurePreset": "qwen3-5-moe-cuda",
5746
"targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"]
5847
},
5948
{
@@ -66,7 +55,7 @@
6655
"workflowPresets": [
6756
{
6857
"name": "qwen3-5-moe-cuda",
69-
"displayName": "Configure and build Qwen3.5 MoE runner (CUDA)",
58+
"displayName": "Configure and build Qwen3.5 MoE runner + serving worker (CUDA)",
7059
"steps": [
7160
{
7261
"type": "configure",
@@ -78,20 +67,6 @@
7867
}
7968
]
8069
},
81-
{
82-
"name": "qwen3-5-moe-cuda-serve",
83-
"displayName": "Configure and build Qwen3.5 MoE runner + serving worker (CUDA)",
84-
"steps": [
85-
{
86-
"type": "configure",
87-
"name": "qwen3-5-moe-cuda-serve"
88-
},
89-
{
90-
"type": "build",
91-
"name": "qwen3-5-moe-cuda-serve"
92-
}
93-
]
94-
},
9570
{
9671
"name": "qwen3-5-moe-metal",
9772
"displayName": "Configure and build Qwen3.5 MoE runner (Metal)",

examples/models/qwen3_5_moe/README.md

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,14 +100,16 @@ It can be uploaded to HuggingFace Hub for easy sharing.
100100

101101
ExecuTorch must be installed from source first (see
102102
[Prerequisites](#prerequisites)). The `make` target handles building
103-
core libraries and the runner binary.
103+
core libraries and the binaries.
104104

105105
```bash
106106
make qwen3_5_moe-cuda
107107
```
108108

109109
This builds ExecuTorch with CUDA backend support, then the runner binary
110-
at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner`.
110+
at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner` and the
111+
serving worker at `cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker`
112+
(see [Serving](#serving-openai-compatible)).
111113

112114
## Run
113115

@@ -144,10 +146,10 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
144146
Run an OpenAI-compatible HTTP server so an agent harness (pi, opencode, …) can
145147
use the model for local tool-use. Point your client at `http://<host>:<port>/v1`.
146148

147-
Build the runner **and** the serving worker:
149+
The CUDA build produces the runner **and** the serving worker:
148150

149151
```bash
150-
make qwen3_5_moe-cuda-serve
152+
make qwen3_5_moe-cuda
151153
```
152154

153155
Launch (the `LD_LIBRARY_PATH` shim is forwarded to the worker for the CUDA blob):

examples/models/qwen3_5_moe/model.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,9 @@ any harness) drive the model without knowing it is Qwen-MoE or CUDA.
151151
that shares that one model but owns its own per-session mutable state
152152
(KV/conv/recurrent), rebound before execute under the engine lock.
153153
`serving_capacity()` reports how many such sessions fit without duplicating
154-
weights (or 1 if the backend can't rebind). The serving path is still
155-
single-slot until the worker exposes multi-session.
154+
weights (or 1 if the backend can't rebind). The serving worker exposes this
155+
over its protocol: requests are routed by `session_id` and warm-resume their context
156+
across turns, serialized to one in-flight request.
156157
- **`Qwen35MoESession`** owns the mutable conversation state (KV / conv /
157158
recurrent arenas via the Module, position cursor, pending token).
158159
`prefill_tokens` dispatches to `prefill` (T≥2) or `decode` (T==1);

examples/models/qwen3_5_moe/qwen35_moe_engine.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,10 @@
2525
// rebind. The per-session rebind machinery is CUDA-backend-private (see
2626
// backends/cuda/runtime/cuda_mutable_state).
2727
//
28-
// The SERVING path (qwen3_5_moe_worker + control plane) is still single-slot:
29-
// it creates one session and queues requests on it. Exposing the engine's
30-
// multi-session capability over the worker protocol is a follow-up.
28+
// The SERVING path (qwen3_5_moe_worker + control plane) exposes this over the
29+
// worker protocol: the worker routes requests to per-session_id state (V2a) and
30+
// reuses each session's resident context across requests (warm append-only
31+
// resume, V2b.1). Execution stays serialized (one in-flight request).
3132

3233
#pragma once
3334

examples/models/qwen3_5_moe/serve.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@
4040
from pathlib import Path
4141

4242
from executorch.extension.llm.server.python.chat_template import ChatTemplate
43-
from executorch.extension.llm.server.python.runner_pool import RunnerPool
4443
from executorch.extension.llm.server.python.serving_chat import ServingChat
44+
from executorch.extension.llm.server.python.session_runtime import SessionRuntime
4545
from executorch.extension.llm.server.python.tool_parsers import QwenFunctionCallDetector
4646
from executorch.extension.llm.server.python.worker_client import spawn_worker
4747

@@ -89,9 +89,9 @@ def build_app_from_args(args):
8989
)
9090

9191
worker = _spawn(args) # one worker process; it hosts many isolated sessions (serialized execution)
92-
pool = RunnerPool([worker])
92+
runtime = SessionRuntime(worker)
9393
serving = ServingChat(
94-
pool,
94+
runtime,
9595
template,
9696
args.model_id,
9797
max_context=args.max_context,
@@ -105,7 +105,7 @@ def build_app_from_args(args):
105105

106106
@app.on_event("shutdown")
107107
def _stop_worker():
108-
pool.close()
108+
runtime.close_worker()
109109

110110
return app, args.model_id
111111

extension/llm/server/README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,10 @@ Hugging Face chat templates (`--hf-tokenizer`), `temperature` / `max_tokens` /
2525
`max_completion_tokens` / `stop`, Hermes tool calling by default
2626
(`<tool_call>...</tool_call>` JSON, complete calls only; model-specific launchers
2727
may select the Qwen XML format) with `tool_choice="none"`,
28-
structured API errors, and best-effort cancellation. V1 serving is single-slot
29-
(one worker, one session) with no prefix cache; KV prefix reuse, if it returns,
30-
lives inside the worker/session, not the control plane. Unsupported params (including `top_p`,
28+
structured API errors, and best-effort cancellation. Serving uses one worker process with
29+
serialized execution; it hosts many isolated sessions on one weight load (warm
30+
append-only resume across turns). KV/prefix state lives inside the
31+
worker/session, not the control plane. Unsupported params (including `top_p`,
3132
`seed`, `n>1`, `reasoning_effort`, penalties, `logit_bias`, `response_format`,
3233
`logprobs`, and `tool_choice="required"`) are rejected with a structured 400
3334
rather than silently ignored. See `python/README.md` to run it and

extension/llm/server/python/README.md

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ Key flags:
6969
| `--allow-chatml-fallback` | opt into approximate ChatML when no HF tokenizer |
7070
| `--no-think` | default `enable_thinking=False` (e.g. Qwen3) |
7171
| `--max-context N` | reject over-long prompts with 400 instead of failing mid-gen |
72-
| `--num-runners N` | V1 supports **1 only** (single-slot: one worker serves one session; concurrent requests queue) |
72+
| `--num-runners N` | Worker processes — **1 only** (one worker hosts many isolated sessions on one weight load; more would duplicate weights) |
7373
| `--worker-bin PATH` | path to the `text_llm_worker` binary (default: `cmake-out/extension/llm/server/cpp/text_llm_worker`) |
7474

7575
## Use from an agent harness
@@ -101,16 +101,19 @@ pytest tests/
101101
OPENAI_BASE_URL=http://127.0.0.1:8000/v1 pytest ../conformance/test_openai_contract.py
102102
```
103103

104-
`tests/` builds a `RunnerPool` over a single `FakeRunner` worker handle, so the
104+
`tests/` builds a `SessionRuntime` over a single `FakeRunner` worker, so the
105105
real server/protocol/streaming code is tested over HTTP without a `.pte`. The
106106
worker JSONL protocol is covered separately by `tests/test_worker_client.py`.
107107

108108
## Architecture
109109

110-
Control plane (this dir, Python): server, OpenAI protocol, chat templating,
111-
streaming bridge, tool parsing — no CUDA, no model, no pybind. Data plane (C++):
112-
a worker process (`text_llm_worker`) owns one model session and does all token
113-
stepping and KV mutation; it speaks one JSON object per line on stdin/stdout.
110+
Control plane (this dir, Python): an OpenAI adapter (`serving_chat`) over a
111+
stateful `SessionRuntime` over one `WorkerClient` — server, protocol, chat
112+
templating, streaming bridge, tool parsing — no CUDA, no model, no pybind. Data
113+
plane (C++): a worker process (`text_llm_worker`) that owns all model state
114+
(many isolated sessions on one weight load, warm-resume prefix logic) and does
115+
all token stepping and KV mutation; it speaks one JSON object per line on
116+
stdin/stdout.
114117

115118
JSONL protocol (stdout carries protocol JSON only; logs go to stderr):
116119

@@ -132,9 +135,9 @@ does blocking pipe I/O on its executor thread.
132135
| `server.py` | FastAPI app, routes, CLI entrypoint, worker spawn |
133136
| `protocol.py` | OpenAI request/response schemas |
134137
| `chat_template.py` | messages (+tools) → prompt string |
135-
| `worker_client.py` | spawn a worker process + drive it over JSONL |
136-
| `runner_pool.py` | worker pool (one in-flight request per worker) + async streaming bridge |
137-
| `serving_chat.py` | `/v1/chat/completions` (streaming + non-streaming, stop, tools) |
138+
| `worker_client.py` | spawn a worker process + drive it over JSONL (raw transport) |
139+
| `session_runtime.py` | stateful runtime over one worker: open/generate/reset/close + streaming bridge |
140+
| `serving_chat.py` | `/v1/chat/completions` OpenAI adapter (streaming + non-streaming, stop, tools) |
138141
| `tool_parsers/` | Hermes/Qwen `<tool_call>` parser only |
139142
| `cpp/text_llm_worker.cpp` | the generic C++ worker binary |
140143

@@ -151,11 +154,11 @@ imports an example. Backend specifics (CUDA/AOTI, Metal) stay inside the worker.
151154
## Scope & caveats
152155

153156
Deliberately narrow (reliability-first): Hermes/Qwen tool calling only;
154-
unsupported sampling params are rejected, not ignored. V1 is **single-slot**: one
155-
worker hosts one session, so `--num-runners` accepts 1 and concurrent requests
156-
queue. Serving capacity is worker capacity, chosen by the launcher (each worker
157-
is its own process with its own weights, so N workers cost N × the weight memory)
158-
— an operator decision, not something the pool infers.
157+
unsupported sampling params are rejected, not ignored. **One worker process,
158+
serialized execution** (one in-flight request; concurrent requests queue).
159+
Session capacity is determined by the worker/engine — a single worker hosts many
160+
isolated sessions on one weight load — so `--num-runners` accepts 1; extra worker
161+
processes would each carry their own copy of the weights.
159162

160163
Cancellation is best-effort: a worker request runs to completion and is not
161164
interruptible mid-generation in V1, so `runner.stop()` means "the control plane

0 commit comments

Comments
 (0)