Skip to content

Commit b6ce438

Browse files
committed
[UPDATE] Update
[ghstack-poisoned]
2 parents a6f5d73 + 5f302dd commit b6ce438

11 files changed

Lines changed: 809 additions & 75 deletions

File tree

examples/models/qwen3_5_moe/CMakeLists.txt

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,19 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
9090
target_link_options_gc_sections(qwen3_5_moe_worker)
9191
target_link_options(qwen3_5_moe_worker PRIVATE "LINKER:-s")
9292
endif()
93+
94+
# GPU no-bleed integration proof (per-execute interleaving of two sessions on
95+
# one physical model). Self-asserting; skips cleanly without a device/model.
96+
# Real guard for V2 mutable-state isolation -- run on the nightly/manual GPU
97+
# job: QWEN_MODEL_PATH=... QWEN_DATA_PATH=... QWEN_TOKENIZER_PATH=... ctest -R
98+
# qwen_nobleed
99+
enable_testing()
100+
add_executable(
101+
test_qwen35_moe_nobleed test_qwen35_moe_nobleed.cpp qwen35_moe_engine.cpp
102+
)
103+
target_include_directories(
104+
test_qwen35_moe_nobleed PUBLIC ${_common_include_directories}
105+
${_json_include}
106+
)
107+
target_link_libraries(test_qwen35_moe_nobleed PUBLIC ${link_libraries})
108+
add_test(NAME qwen_nobleed COMMAND test_qwen35_moe_nobleed)
Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,272 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// GPU no-bleed integration proof for the CUDA V2 per-session mutable-state
10+
// rebind -- the REAL guard for mutable-buffer completeness (an under-declared
11+
// buffer would be shared across sessions; only behavior catches that, not the
12+
// declared-subset-of-discovered bookkeeping check). This is the automated form
13+
// of the manual "A solo / A inter" proof in the V2 foundation commit.
14+
//
15+
// CRITICAL: sessions are interleaved at EXECUTE granularity (A prefill, B
16+
// prefill, A decode, B decode, ...). The mechanism under test is the
17+
// per-execute rebind, so running A-to-completion then B would pass even with a
18+
// broken rebind.
19+
//
20+
// GPU-gated: requires a CUDA device + an exported model. Set QWEN_MODEL_PATH,
21+
// QWEN_DATA_PATH, QWEN_TOKENIZER_PATH. Skips cleanly (exit 0) if unset or the
22+
// engine cannot be created (no device) -- so it is safe in CI; the real run is
23+
// the nightly/manual GPU job.
24+
25+
#include <executorch/examples/models/qwen3_5_moe/qwen35_moe_engine.h>
26+
27+
#include <executorch/backends/cuda/runtime/cuda_mutable_state.h>
28+
29+
#include <cstdint>
30+
#include <cstdio>
31+
#include <cstdlib>
32+
#include <memory>
33+
#include <string>
34+
#include <vector>
35+
36+
#include <cuda_runtime.h>
37+
38+
namespace llm = ::executorch::extension::llm;
39+
using ::executorch::runtime::Error;
40+
41+
namespace {
42+
int g_failures = 0;
43+
void check(const char* name, bool ok) {
44+
printf(" [%s] %s\n", ok ? "PASS" : "FAIL", name);
45+
if (!ok) {
46+
++g_failures;
47+
}
48+
}
49+
50+
const char* env(const char* k) {
51+
const char* v = std::getenv(k);
52+
return (v && *v) ? v : nullptr;
53+
}
54+
55+
std::vector<uint64_t> encode(llm::Qwen35MoEEngine& e, const std::string& s) {
56+
auto r = e.tokenizer()->encode(s);
57+
return r.ok() ? std::move(*r) : std::vector<uint64_t>{};
58+
}
59+
60+
// Greedy-decode `n` non-terminal tokens from a freshly-prefilled session.
61+
std::vector<uint64_t>
62+
solo_decode(llm::LLMSession& s, std::vector<uint64_t> prompt, int n) {
63+
llm::SamplingConfig samp; // temperature -1 => greedy/argmax (deterministic)
64+
std::vector<uint64_t> out;
65+
if (s.prefill_tokens(prompt, &samp) != Error::Ok) {
66+
return out;
67+
}
68+
for (int i = 0; i < n; ++i) {
69+
auto r = s.decode_one(samp);
70+
if (r.error() != Error::Ok || r.get().is_terminal) {
71+
break;
72+
}
73+
out.push_back(r.get().token_id);
74+
}
75+
return out;
76+
}
77+
78+
int64_t gpu_free() {
79+
size_t free = 0, total = 0;
80+
return cudaMemGetInfo(&free, &total) == cudaSuccess
81+
? static_cast<int64_t>(free)
82+
: -1;
83+
}
84+
85+
// GPU-FREE fall-closed DEFAULTS of cuda_mutable_state (no device, no handle).
86+
// Covers only the safety defaults -- the descriptor build, positive coverage,
87+
// bytes_per_session sum, and symbols_available AND-fold are exercised
88+
// BEHAVIORALLY by the no-bleed integration test below (the real guard); a
89+
// GPU-free unit test of those branches would need a build_descriptors allocator
90+
// seam / fake-handle harness and is a knowingly-deferred follow-up.
91+
namespace cu = ::executorch::backends::cuda;
92+
void test_mutable_state_fallclosed_defaults() {
93+
printf("cuda_mutable_state fall-closed defaults (GPU-free):\n");
94+
const cu::MutableStateContext bad = 999999; // never created
95+
cu::MutableStateContext c1 = cu::mutable_state_create_context();
96+
cu::MutableStateContext c2 = cu::mutable_state_create_context();
97+
check("context ids are distinct/monotonic", c2 > c1);
98+
check(
99+
"fresh context: rebinding unavailable (no handle)",
100+
!cu::mutable_state_available(c1));
101+
check(
102+
"bytes_per_session: 0 for fresh and unknown",
103+
cu::mutable_state_bytes_per_session(c1) == 0 &&
104+
cu::mutable_state_bytes_per_session(bad) == 0);
105+
check(
106+
"validate_coverage: unknown ctx -> InvalidArgument",
107+
cu::mutable_state_validate_coverage(bad) == Error::InvalidArgument);
108+
check(
109+
"validate_coverage: no symbols -> NotSupported (fall closed)",
110+
cu::mutable_state_validate_coverage(c1) == Error::NotSupported);
111+
// Declaring FQNs without symbols still falls closed (the check is gated on
112+
// symbols, so it never wrongly passes coverage with nothing discovered).
113+
cu::mutable_state_register_fqns(c1, {"a.b", "c.d"});
114+
check(
115+
"validate_coverage: declared-but-no-symbols still NotSupported",
116+
cu::mutable_state_validate_coverage(c1) == Error::NotSupported);
117+
check(
118+
"create_session: unknown ctx -> InvalidArgument",
119+
cu::mutable_state_create_session(bad).error() == Error::InvalidArgument);
120+
check(
121+
"create_session: no symbols -> NotSupported",
122+
cu::mutable_state_create_session(c1).error() == Error::NotSupported);
123+
cu::mutable_state_destroy_session(bad, 0); // no-op, must not crash
124+
cu::mutable_state_destroy_context(bad); // no-op, must not crash
125+
cu::mutable_state_destroy_context(c1);
126+
cu::mutable_state_destroy_context(c2);
127+
check("destroy of unknown ctx/session is a safe no-op", true);
128+
}
129+
130+
} // namespace
131+
132+
int main() {
133+
// GPU-free fall-closed defaults always run (even when the integration part
134+
// skips for lack of a device).
135+
test_mutable_state_fallclosed_defaults();
136+
137+
const char* model = env("QWEN_MODEL_PATH");
138+
const char* tok = env("QWEN_TOKENIZER_PATH");
139+
if (!model || !tok) {
140+
printf(
141+
"SKIP: integration proof needs QWEN_MODEL_PATH / QWEN_TOKENIZER_PATH "
142+
"(+ QWEN_DATA_PATH) on a CUDA box.\n");
143+
return g_failures ? 1 : 0;
144+
}
145+
llm::Qwen35MoEConfig config;
146+
config.model_path = model;
147+
config.data_path = env("QWEN_DATA_PATH") ? env("QWEN_DATA_PATH") : "";
148+
config.tokenizer_path = tok;
149+
config.max_sessions = 4;
150+
151+
auto engine_r = llm::Qwen35MoEEngine::create(config);
152+
if (engine_r.error() != Error::Ok) {
153+
printf("SKIP: engine create failed (no CUDA device / bad paths).\n");
154+
return 0;
155+
}
156+
auto engine = std::move(engine_r.get());
157+
printf("no-bleed integration proof:\n");
158+
159+
const int kN = 24;
160+
auto prompt_a = encode(*engine, "List three colors:");
161+
auto prompt_b =
162+
encode(*engine, "Name two countries in Europe and explain why.");
163+
check("prompts encoded", !prompt_a.empty() && !prompt_b.empty());
164+
165+
// (1) Session A solo -> baseline greedy ids.
166+
auto sa_r = engine->create_session();
167+
check("create session A", sa_r.error() == Error::Ok);
168+
std::vector<uint64_t> ids_solo;
169+
if (sa_r.error() == Error::Ok) {
170+
auto sa = std::move(sa_r.get());
171+
ids_solo = solo_decode(*sa, prompt_a, kN);
172+
}
173+
check("solo produced tokens", !ids_solo.empty());
174+
175+
// (2) A2 and B interleaved at EXECUTE granularity.
176+
auto a2_r = engine->create_session();
177+
auto b_r = engine->create_session();
178+
check("create A2 + B", a2_r.error() == Error::Ok && b_r.error() == Error::Ok);
179+
std::vector<uint64_t> ids_a2, ids_b;
180+
if (a2_r.error() == Error::Ok && b_r.error() == Error::Ok) {
181+
auto a2 = std::move(a2_r.get());
182+
auto b = std::move(b_r.get());
183+
llm::SamplingConfig samp;
184+
bool ok = a2->prefill_tokens(prompt_a, &samp) == Error::Ok && // A prefill
185+
b->prefill_tokens(prompt_b, &samp) == Error::Ok; // then B prefill
186+
check("interleaved prefills", ok);
187+
bool a_done = false, b_done = false;
188+
for (int i = 0; i < kN && ok; ++i) {
189+
if (!a_done) { // A decode
190+
auto r = a2->decode_one(samp);
191+
if (r.error() != Error::Ok || r.get().is_terminal) {
192+
a_done = true;
193+
} else {
194+
ids_a2.push_back(r.get().token_id);
195+
}
196+
}
197+
if (!b_done) { // B decode (between A's steps)
198+
auto r = b->decode_one(samp);
199+
if (r.error() != Error::Ok || r.get().is_terminal) {
200+
b_done = true;
201+
} else {
202+
ids_b.push_back(r.get().token_id);
203+
}
204+
}
205+
}
206+
}
207+
208+
// THE no-bleed assertion: A's interleaved output is bit-identical to A solo
209+
// (greedy is deterministic), so B's interleaved session state did not corrupt
210+
// A's -- i.e. each session's mutable buffers are truly isolated.
211+
check(
212+
"no bleed: A interleaved == A solo (bit-identical)", ids_a2 == ids_solo);
213+
// Sanity that B actually ran a different conversation (else the test is
214+
// vacuous).
215+
check("B ran a distinct conversation", !ids_b.empty() && ids_b != ids_solo);
216+
217+
// (3) Per-extra-session memory is STATE-sized, not a second model load.
218+
// Per-session buffers are allocated LAZILY on first execute (rebind), not at
219+
// create_session(), so measure the free-memory delta around a fresh session's
220+
// first prefill.
221+
const int64_t est = engine->serving_capacity().estimated_bytes_per_session;
222+
{
223+
int64_t free_before = gpu_free();
224+
auto extra_r = engine->create_session();
225+
if (extra_r.error() == Error::Ok) {
226+
auto extra = std::move(extra_r.get());
227+
llm::SamplingConfig samp;
228+
extra->prefill_tokens(
229+
prompt_a, &samp); // first execute -> allocates state
230+
int64_t free_after = gpu_free();
231+
if (free_before > 0 && free_after > 0) {
232+
const int64_t delta = free_before - free_after;
233+
printf(
234+
" extra-session GPU delta=%lld bytes (est/session=%lld)\n",
235+
(long long)delta,
236+
(long long)est);
237+
check(
238+
"extra session is state-sized (>0, < 4 GB, not an ~18 GB reload)",
239+
delta > 0 && delta < (4LL << 30));
240+
if (est > 0) {
241+
check(
242+
"memory delta within 2x of estimated_bytes_per_session",
243+
delta <= est * 2 + (256LL << 20));
244+
}
245+
}
246+
}
247+
} // extra released here -> frees its slot before the capacity test
248+
249+
// (4) Capacity: the (max_sessions+1)th create_session fails (no silent
250+
// share). The sessions above already hold slots; create up to capacity then
251+
// one more.
252+
std::vector<std::unique_ptr<llm::LLMSession>> held;
253+
while (true) {
254+
auto r = engine->create_session();
255+
if (r.error() != Error::Ok) {
256+
break;
257+
}
258+
held.push_back(std::move(r.get()));
259+
if (held.size() > (size_t)config.max_sessions + 2) {
260+
break; // guard against a non-enforcing backend
261+
}
262+
}
263+
check(
264+
"capacity enforced: create_session fails past max_sessions",
265+
held.size() <= (size_t)config.max_sessions);
266+
267+
printf(
268+
"\n%s (%d failure(s))\n",
269+
g_failures ? "FAILURES" : "ALL PASS",
270+
g_failures);
271+
return g_failures ? 1 : 0;
272+
}

extension/llm/runner/llm_session.h

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,11 @@ struct LLMServingCapacity {
6161
// sessions would copy the whole model); raise only on a backend proven to
6262
// share packed weights.
6363
int32_t max_physical_sessions_without_weight_duplication = 1;
64-
// Planned bytes one session adds (KV + activations), for memory-budget
65-
// admission. 0 = unknown; the server skips the memory clamp.
64+
// Planned bytes one session adds (KV + activations). Reported for a FUTURE
65+
// memory-budget admission policy; NOT yet enforced -- admission is currently
66+
// by session COUNT only (--max-sessions). Over-provisioning therefore fails
67+
// at the first execute (cudaMalloc) of the over-committed session, not at
68+
// admit time. 0 = unknown.
6669
int64_t estimated_bytes_per_session = 0;
6770
};
6871

@@ -79,14 +82,28 @@ class ET_EXPERIMENTAL LLMSession {
7982
/// `initial_sampling` (optional): the sampling config for the FIRST generated
8083
/// token, for backends that sample during prefill (e.g. in-graph sampling).
8184
/// Pass it so the first token uses the request's sampling instead of a stale
82-
/// default. Backends that only sample in decode_one() ignore it.
85+
/// default. Backends that only sample in decode_one() ignore it. NOTE:
86+
/// because the first token is sampled here, it does NOT pass through
87+
/// decode_one()'s logit processors -- a grammar/tool mask that must constrain
88+
/// the opening token is not applied to it (a known limitation for
89+
/// grammar-constrained serving).
90+
///
91+
/// ERROR CONTRACT: an error may be returned AFTER backend state has already
92+
/// mutated. On any error from prefill_tokens()/decode_one(), the session is
93+
/// POISONED -- position() may no longer agree with the resident KV. The
94+
/// caller must call reset() (and only proceed once it returns Ok) before any
95+
/// further prefill/decode; it must NOT retry the failed call. The serving
96+
/// worker enforces this (marks the session dirty and forces a reset next
97+
/// request).
8398
virtual ::executorch::runtime::Error prefill_tokens(
8499
std::vector<uint64_t> tokens,
85100
const SamplingConfig* initial_sampling = nullptr) = 0;
86101

87102
/// Decode one token from the pending state; looping reproduces a full
88103
/// generation while returning exact sampled token ids. A single decode_one()
89104
/// runs one forward pass and is not interruptible mid-call (see stop()).
105+
/// On error the session is poisoned -- see the error contract on
106+
/// prefill_tokens() (reset() before any further use; never retry).
90107
virtual ::executorch::runtime::Result<DecodeResult> decode_one(
91108
const SamplingConfig& sampling) = 0;
92109

extension/llm/server/python/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,14 @@ Session capacity is determined by the worker/engine — a single worker hosts ma
160160
isolated sessions on one weight load — so `--num-runners` accepts 1; extra worker
161161
processes would each carry their own copy of the weights.
162162

163+
The **generic `text_llm_worker` is scratch-only (V1)**: `TextLLMEngine::serving_capacity()`
164+
is a conservative 1, so `max_named = max(0, capacity-1) = 0` — the default
165+
`server.py` serves only the anonymous scratch session (no named `session_id`s, no
166+
warm resume). The named-session / warm-resume / token-ID machinery is exercised
167+
by a model-specific worker whose engine reports capacity > 1 (the Qwen3.5-MoE CUDA
168+
worker). This is intentional; the generic worker stays minimal until a backend is
169+
proven to host multiple physical sessions without duplicating weights.
170+
163171
Cancellation is best-effort: a worker request runs to completion and is not
164172
interruptible mid-generation in V1, so `runner.stop()` means "the control plane
165173
stops consuming and the worker finishes the current request" rather than a hard

0 commit comments

Comments
 (0)