88
99#pragma once
1010
11- // Shared model-worker generation loop + JSONL protocol, used by every model
12- // worker (the generic text_llm_worker and model-specific workers like
13- // qwen3_5_moe_worker). A worker only constructs its engine/ tokenizer and calls
14- // run_worker_stdio_loop(); the protocol, session management , and the decode
15- // loop live here once, so protocol changes land in a single place .
11+ // Shared model-worker generation loop + JSONL protocol for every model worker
12+ // (the generic text_llm_worker and model-specific workers like
13+ // qwen3_5_moe_worker): a worker constructs its engine + tokenizer and calls
14+ // run_worker_stdio_loop(); the protocol, session routing , and decode loop live
15+ // here once.
1616//
17- // V2a (isolation): the worker owns one LLMEngine (weights loaded once) and
18- // hands out multiple isolated LLMSessions keyed by session_id, each with its
19- // own KV/recurrent state, up to the engine's serving capacity. Execution is
20- // synchronous -- one in-flight request at a time, the control plane serializes.
17+ // The worker owns one LLMEngine (weights loaded once) and serves multiple
18+ // isolated LLMSessions keyed by session_id, up to the engine's serving
19+ // capacity; anonymous requests (no session_id) share one scratch session that
20+ // is reset every request. Execution is synchronous: one in-flight request at a
21+ // time.
2122//
22- // V2b.1 (warm append-only resume): a named session keeps its decoded context
23- // across requests. On the next request the worker compares the new prompt's
24- // token ids against the session's resident token ids; if the resident ids are
25- // an exact prefix, it prefills ONLY the suffix (continuing the KV/recurrent
26- // state at pos>0) instead of resetting and re-prefilling the whole prompt. The
27- // check is exact-token (never string/retokenized text) and falls back to a full
28- // reset+prefill whenever exact reuse can't be proven, so it is always correct;
29- // the win is when the prompt is a genuine token extension of the prior turn.
23+ // Warm resume: a named session keeps its decoded context across requests. The
24+ // new prompt's token ids are matched against the session's resident token ids;
25+ // on an exact prefix only the suffix is prefilled (continuing at pos>0). The
26+ // match is exact-token (never retokenized text) and falls back to a full
27+ // reset+prefill whenever exact reuse can't be proven, so it is always correct.
3028// See plan_prefill().
3129//
32- // Sessions:
33- // - Named: an explicit session_id -> session + resident token ids, created on
34- // first use (or via an `open` op), capped at max_named_sessions = capacity
35- // - 1 (the scratch slot is reserved). 0 when the backend hosts one session.
36- // Warm resume applies to named sessions (unless disabled).
37- // - Scratch: one session for anonymous requests (no session_id), reset every
38- // request -- distinct anonymous callers must never reuse each other's
39- // state.
40- //
41- // Protocol (one JSON object per line; matches worker_client.py):
30+ // Protocol (one JSON object per line; matches worker_client.py). stdout carries
31+ // ONLY protocol JSON; logs go to stderr (ET_LOG):
4232// worker -> stdout, once: {"ready": true, "max_sessions": int,
4333// "max_named_sessions": int}
4434// client -> stdin:
45- // generate: {"max_new_tokens": int, "temperature": float,
46- // "stop": [str, ...], "session_id"?: str,
47- // and exactly one prompt form:
48- // "prompt": str
49- // "prompt_segments": [{"text": str} | {"ids": [int, ...]}]}
50- // open: {"op": "open", "session_id": str}
51- // close: {"op": "close", "session_id": str}
52- // reset: {"op": "reset", "session_id": str} // clear context, keep
53- // slot
35+ // generate: {"max_new_tokens": int, "temperature": float, "stop":
36+ // [str,...],
37+ // "session_id"?: str, and exactly one prompt form:
38+ // "prompt": str
39+ // "prompt_segments": [{"text": str} | {"ids": [int,...]}]}
40+ // open/close/reset: {"op": "open"|"close"|"reset", "session_id": str}
5441// worker -> stdout:
55- // generate: {"token": str} * (streamed)
56- // {"done": true, "prompt_tokens": int, "completion_tokens":
57- // int,
58- // "finish_reason": "stop"|"length",
59- // "reused_prompt_tokens": int, "prefilled_prompt_tokens": int,
60- // "session_reset_reason": "new"|"exact_prefix"|"dirty"|
61- // "mismatch"|"equal",
62- // "generated_token_ids"?: [int, ...]} // omitted if
63- // stop-trimmed
64- // open: {"opened": true, "session_id": str}
65- // close: {"closed": true, "session_id": str}
66- // reset: {"reset": true, "session_id": str}
67- // error: {"error": str, "code"?: str} // code: "capacity_exhausted",
68- // // "unsupported_session"
69- //
70- // stdout carries ONLY protocol JSON; all logs go to stderr (ET_LOG). One
71- // request at a time (the control plane serializes).
42+ // generate: {"token": str} * (streamed), then
43+ // {"done": true, "prompt_tokens": int, "completion_tokens": int,
44+ // "finish_reason": "stop"|"length",
45+ // "reused_prompt_tokens": int, "prefilled_prompt_tokens": int,
46+ // "session_reset_reason": str
47+ // (new|exact_prefix|mismatch|dirty|equal),
48+ // "generated_token_ids"?: [int,...]} // omitted if stop-trimmed
49+ // open/close/reset: {"opened"|"closed"|"reset": true, "session_id": str}
50+ // error: {"error": str, "code"?: str} // capacity_exhausted |
51+ // // unsupported_session
7252
7353#include < nlohmann/json.hpp>
7454
@@ -242,7 +222,12 @@ inline void worker_handle_request(
242222 const auto & d = step_result.get ();
243223 if (d.is_terminal ) {
244224 finish = " stop" ;
245- break ; // terminal step (EOS / cooperative stop): not emitted or counted
225+ // Terminal step (EOS / cooperative stop): the terminal token is neither
226+ // emitted as text nor counted in num_generated -> completion_tokens. This
227+ // is intentional -- completion_tokens reflects the visible completion the
228+ // client received, not internal forward steps; an EOS the user never sees
229+ // is not part of that count.
230+ break ;
246231 }
247232 // The token was forwarded into the cache (pos advanced); track it so the
248233 // resident-ids/position invariant holds. EOS/terminal tokens are not
@@ -264,9 +249,15 @@ inline void worker_handle_request(
264249 if (stop_hit) {
265250 finish = " stop" ; // reached a stop string: drop it and everything after
266251 stop_string = true ;
267- // The emitted text was trimmed at the stop string, so the next turn's
268- // rendered prompt won't be an exact token extension of resident: force a
269- // reset rather than risk a false prefix match.
252+ // Trimming at the stop means the next turn's prompt won't be an exact
253+ // token extension of resident, so force a reset (no false prefix match).
254+ //
255+ // CONTRACT: every *string* stop is non-resumable this way (trim + dirty +
256+ // omit generated_token_ids) -- right for user/request and content-cleanup
257+ // stops, which change visible text. A clean turn terminator stays
258+ // warm-resumable only if the engine surfaces it as a terminal/EOS token
259+ // id (handled above via d.is_terminal; e.g. Qwen adds <|im_end|> to
260+ // eos_ids).
270261 st.dirty = true ;
271262 break ;
272263 }
@@ -400,8 +391,8 @@ class WorkerSessions {
400391// Emit {"ready": true, ...}, then read JSONL requests from stdin and dispatch
401392// each (generate / open / close / reset), reporting exceptions as
402393// {"error": ...} and continuing to serve. Returns 0 when stdin closes.
403- // enable_warm_resume gates V2b.1 warm suffix reuse for named sessions (off ->
404- // every request resets, the V2a behavior ; useful for A/B measurement).
394+ // enable_warm_resume gates warm suffix reuse for named sessions (off -> every
395+ // request resets and re-prefills ; useful for A/B measurement).
405396inline int run_worker_stdio_loop (
406397 LLMEngine& engine,
407398 ::tokenizers::Tokenizer& tokenizer,
0 commit comments