pytorch
diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt
Lines changed: 16 additions & 0 deletions
diff --git a/examples/models/qwen3_5_moe/test_qwen35_moe_nobleed.cpp b/examples/models/qwen3_5_moe/test_qwen35_moe_nobleed.cpp
Lines changed: 272 additions & 0 deletions
diff --git a/extension/llm/runner/llm_session.h b/extension/llm/runner/llm_session.h
Lines changed: 20 additions & 3 deletions
diff --git a/extension/llm/server/python/README.md b/extension/llm/server/python/README.md
Lines changed: 8 additions & 0 deletions
@@ -90,3 +90,19 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(qwen3_5_moe_worker)
   target_link_options(qwen3_5_moe_worker PRIVATE "LINKER:-s")
 endif()
+
+# GPU no-bleed integration proof (per-execute interleaving of two sessions on
+# one physical model). Self-asserting; skips cleanly without a device/model.
+# Real guard for V2 mutable-state isolation -- run on the nightly/manual GPU
+# job:
+#   QWEN_MODEL_PATH=... QWEN_DATA_PATH=... QWEN_TOKENIZER_PATH=... \
+#     ctest -R qwen_nobleed
+# NOTE: a skipped run exits 0 (as long as the GPU-free checks pass), so CTest
+# reports it as passed rather than skipped.
+enable_testing()
+# The test executable compiles the engine source together with the test driver.
+add_executable(
+  test_qwen35_moe_nobleed test_qwen35_moe_nobleed.cpp qwen35_moe_engine.cpp
+)
+target_include_directories(
+  test_qwen35_moe_nobleed PUBLIC ${_common_include_directories}
+                                 ${_json_include}
+)
+target_link_libraries(test_qwen35_moe_nobleed PUBLIC ${link_libraries})
+# Registered with CTest as `qwen_nobleed` (the name matched by `ctest -R`).
+add_test(NAME qwen_nobleed COMMAND test_qwen35_moe_nobleed)
@@ -0,0 +1,272 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// GPU no-bleed integration proof for the CUDA V2 per-session mutable-state
+// rebind -- the REAL guard for mutable-buffer completeness (an under-declared
+// buffer would be shared across sessions; only behavior catches that, not the
+// declared-subset-of-discovered bookkeeping check). This is the automated form
+// of the manual "A solo / A inter" proof in the V2 foundation commit.
+//
+// CRITICAL: sessions are interleaved at EXECUTE granularity (A prefill, B
+// prefill, A decode, B decode, ...). The mechanism under test is the
+// per-execute rebind, so running A-to-completion then B would pass even with a
+// broken rebind.
+//
+// GPU-gated: requires a CUDA device + an exported model. Set QWEN_MODEL_PATH,
+// QWEN_DATA_PATH, QWEN_TOKENIZER_PATH. Skips cleanly (exit 0) if unset or the
+// engine cannot be created (no device) -- so it is safe in CI; the real run is
+// the nightly/manual GPU job.
+
+#include <executorch/examples/models/qwen3_5_moe/qwen35_moe_engine.h>
+
+#include <executorch/backends/cuda/runtime/cuda_mutable_state.h>
+
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include <cuda_runtime.h>
+
+namespace llm = ::executorch::extension::llm;
+using ::executorch::runtime::Error;
+
+namespace {
+// Number of failed check() calls so far; main() turns it into the exit code.
+int g_failures = 0;
+
+// Records one named assertion: prints a "[PASS]"/"[FAIL]" line for `name` and
+// bumps g_failures when `ok` is false. Never aborts, so later checks still run.
+void check(const char* name, bool ok) {
+  const char* verdict = ok ? "PASS" : "FAIL";
+  printf("  [%s] %s\n", verdict, name);
+  g_failures += ok ? 0 : 1;
+}
+
+// Returns the value of environment variable `k`, or nullptr when the variable
+// is unset or set to the empty string (both are treated as "not provided").
+const char* env(const char* k) {
+  const char* value = std::getenv(k);
+  if (value == nullptr || value[0] == '\0') {
+    return nullptr;
+  }
+  return value;
+}
+
+// Tokenizes `s` with the engine's tokenizer. Returns the token ids, or an
+// empty vector when encoding fails (callers treat empty as "encode failed").
+std::vector<uint64_t> encode(llm::Qwen35MoEEngine& e, const std::string& s) {
+  auto encoded = e.tokenizer()->encode(s);
+  if (!encoded.ok()) {
+    return std::vector<uint64_t>{};
+  }
+  return std::move(*encoded);
+}
+
+// Prefills session `s` with `prompt`, then decodes up to `n` non-terminal
+// tokens and returns their ids in order. The result is shorter than `n`
+// (possibly empty) when prefill fails, when a decode_one() call returns an
+// error, or when a terminal token is produced; the terminal token itself is
+// not included.
+std::vector<uint64_t>
+solo_decode(llm::LLMSession& s, std::vector<uint64_t> prompt, int n) {
+  llm::SamplingConfig samp; // temperature -1 => greedy/argmax (deterministic)
+  std::vector<uint64_t> out;
+  // `prompt` is our own by-value copy and prefill_tokens() takes its tokens by
+  // value, so hand the buffer over instead of copying it a second time.
+  if (s.prefill_tokens(std::move(prompt), &samp) != Error::Ok) {
+    return out;
+  }
+  for (int i = 0; i < n; ++i) {
+    auto r = s.decode_one(samp);
+    // An error and a terminal token both end the generation here.
+    if (r.error() != Error::Ok || r.get().is_terminal) {
+      break;
+    }
+    out.push_back(r.get().token_id);
+  }
+  return out;
+}
+
+// Returns the free-memory figure reported by cudaMemGetInfo() in bytes, or -1
+// when the query does not return cudaSuccess.
+int64_t gpu_free() {
+  size_t free_bytes = 0;
+  size_t total_bytes = 0;
+  if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
+    return -1;
+  }
+  return static_cast<int64_t>(free_bytes);
+}
+
+// GPU-FREE fall-closed DEFAULTS of cuda_mutable_state (no device, no handle).
+// Covers only the safety defaults -- the descriptor build, positive coverage,
+// bytes_per_session sum, and symbols_available AND-fold are exercised
+// BEHAVIORALLY by the no-bleed integration test below (the real guard); a
+// GPU-free unit test of those branches would need a build_descriptors allocator
+// seam / fake-handle harness and is a knowingly-deferred follow-up.
+namespace cu = ::executorch::backends::cuda;
+// Exercises the cuda_mutable_state API on contexts that never had symbols
+// registered, plus an id that was never created, and records via check() that
+// every query reports the expected "unavailable"/error result.
+void test_mutable_state_fallclosed_defaults() {
+  printf("cuda_mutable_state fall-closed defaults (GPU-free):\n");
+  // An id that was never returned by mutable_state_create_context().
+  const cu::MutableStateContext unknown_ctx = 999999;
+  cu::MutableStateContext first_ctx = cu::mutable_state_create_context();
+  cu::MutableStateContext second_ctx = cu::mutable_state_create_context();
+  check("context ids are distinct/monotonic", second_ctx > first_ctx);
+
+  const bool fresh_available = cu::mutable_state_available(first_ctx);
+  check("fresh context: rebinding unavailable (no handle)", !fresh_available);
+
+  const bool zero_bytes =
+      cu::mutable_state_bytes_per_session(first_ctx) == 0 &&
+      cu::mutable_state_bytes_per_session(unknown_ctx) == 0;
+  check("bytes_per_session: 0 for fresh and unknown", zero_bytes);
+
+  check(
+      "validate_coverage: unknown ctx -> InvalidArgument",
+      cu::mutable_state_validate_coverage(unknown_ctx) ==
+          Error::InvalidArgument);
+  check(
+      "validate_coverage: no symbols -> NotSupported (fall closed)",
+      cu::mutable_state_validate_coverage(first_ctx) == Error::NotSupported);
+
+  // Declaring FQNs alone must not make coverage pass: with no symbols
+  // discovered the validation is still expected to report NotSupported.
+  cu::mutable_state_register_fqns(first_ctx, {"a.b", "c.d"});
+  check(
+      "validate_coverage: declared-but-no-symbols still NotSupported",
+      cu::mutable_state_validate_coverage(first_ctx) == Error::NotSupported);
+
+  check(
+      "create_session: unknown ctx -> InvalidArgument",
+      cu::mutable_state_create_session(unknown_ctx).error() ==
+          Error::InvalidArgument);
+  check(
+      "create_session: no symbols -> NotSupported",
+      cu::mutable_state_create_session(first_ctx).error() ==
+          Error::NotSupported);
+
+  // Destroying ids that do not exist must be harmless; reaching the final
+  // check() at all is the assertion.
+  cu::mutable_state_destroy_session(unknown_ctx, 0);
+  cu::mutable_state_destroy_context(unknown_ctx);
+  cu::mutable_state_destroy_context(first_ctx);
+  cu::mutable_state_destroy_context(second_ctx);
+  check("destroy of unknown ctx/session is a safe no-op", true);
+}
+
+} // namespace
+
+// Test driver. Always runs the GPU-free fall-closed checks, then -- when
+// QWEN_MODEL_PATH / QWEN_TOKENIZER_PATH are set and the engine can be created
+// -- runs the integration proof:
+//   (1) session A alone                      -> baseline token ids
+//   (2) sessions A2 and B interleaved per execute; A2 must equal the baseline
+//   (3) GPU free-memory delta of one extra session's first prefill
+//   (4) create_session() must start failing by max_sessions
+// Exit code: 0 when no check() failed (including every SKIP path), else 1.
+int main() {
+  // GPU-free fall-closed defaults always run (even when the integration part
+  // skips for lack of a device).
+  test_mutable_state_fallclosed_defaults();
+
+  const char* model = env("QWEN_MODEL_PATH");
+  const char* tok = env("QWEN_TOKENIZER_PATH");
+  if (!model || !tok) {
+    printf(
+        "SKIP: integration proof needs QWEN_MODEL_PATH / QWEN_TOKENIZER_PATH "
+        "(+ QWEN_DATA_PATH) on a CUDA box.\n");
+    return g_failures ? 1 : 0;
+  }
+  const char* data = env("QWEN_DATA_PATH"); // optional; empty when unset
+  llm::Qwen35MoEConfig config;
+  config.model_path = model;
+  config.data_path = data ? data : "";
+  config.tokenizer_path = tok;
+  config.max_sessions = 4;
+
+  auto engine_r = llm::Qwen35MoEEngine::create(config);
+  if (engine_r.error() != Error::Ok) {
+    printf("SKIP: engine create failed (no CUDA device / bad paths).\n");
+    // Skipping the integration part must not hide failures already recorded
+    // by the GPU-free checks above (same rule as the SKIP path before it).
+    return g_failures ? 1 : 0;
+  }
+  auto engine = std::move(engine_r.get());
+  printf("no-bleed integration proof:\n");
+
+  const int kN = 24;
+  auto prompt_a = encode(*engine, "List three colors:");
+  auto prompt_b =
+      encode(*engine, "Name two countries in Europe and explain why.");
+  check("prompts encoded", !prompt_a.empty() && !prompt_b.empty());
+
+  // (1) Session A solo -> baseline greedy ids. The session is scoped to the
+  // if-block, so it is destroyed before step (2) starts.
+  auto sa_r = engine->create_session();
+  check("create session A", sa_r.error() == Error::Ok);
+  std::vector<uint64_t> ids_solo;
+  if (sa_r.error() == Error::Ok) {
+    auto sa = std::move(sa_r.get());
+    ids_solo = solo_decode(*sa, prompt_a, kN);
+  }
+  check("solo produced tokens", !ids_solo.empty());
+
+  // (2) A2 and B interleaved at EXECUTE granularity.
+  auto a2_r = engine->create_session();
+  auto b_r = engine->create_session();
+  check("create A2 + B", a2_r.error() == Error::Ok && b_r.error() == Error::Ok);
+  std::vector<uint64_t> ids_a2, ids_b;
+  if (a2_r.error() == Error::Ok && b_r.error() == Error::Ok) {
+    auto a2 = std::move(a2_r.get());
+    auto b = std::move(b_r.get());
+    llm::SamplingConfig samp;
+    bool ok = a2->prefill_tokens(prompt_a, &samp) == Error::Ok && // A prefill
+        b->prefill_tokens(prompt_b, &samp) == Error::Ok; // then B prefill
+    check("interleaved prefills", ok);
+    bool a_done = false, b_done = false;
+    // Each iteration runs at most one decode per session, A first then B. A
+    // session stops on its first error or terminal token, mirroring
+    // solo_decode() so the two id sequences are comparable.
+    for (int i = 0; i < kN && ok; ++i) {
+      if (!a_done) { // A decode
+        auto r = a2->decode_one(samp);
+        if (r.error() != Error::Ok || r.get().is_terminal) {
+          a_done = true;
+        } else {
+          ids_a2.push_back(r.get().token_id);
+        }
+      }
+      if (!b_done) { // B decode (between A's steps)
+        auto r = b->decode_one(samp);
+        if (r.error() != Error::Ok || r.get().is_terminal) {
+          b_done = true;
+        } else {
+          ids_b.push_back(r.get().token_id);
+        }
+      }
+    }
+  }
+
+  // THE no-bleed assertion: A's interleaved output is bit-identical to A solo
+  // (greedy is deterministic), so B's interleaved session state did not corrupt
+  // A's -- i.e. each session's mutable buffers are truly isolated.
+  check(
+      "no bleed: A interleaved == A solo (bit-identical)", ids_a2 == ids_solo);
+  // Sanity that B actually ran a different conversation (else the test is
+  // vacuous).
+  check("B ran a distinct conversation", !ids_b.empty() && ids_b != ids_solo);
+
+  // (3) Per-extra-session memory is STATE-sized, not a second model load.
+  // Per-session buffers are allocated LAZILY on first execute (rebind), not at
+  // create_session(), so measure the free-memory delta around a fresh session's
+  // first prefill.
+  const int64_t est = engine->serving_capacity().estimated_bytes_per_session;
+  {
+    const int64_t free_before = gpu_free();
+    auto extra_r = engine->create_session();
+    if (extra_r.error() == Error::Ok) {
+      auto extra = std::move(extra_r.get());
+      llm::SamplingConfig samp;
+      // First execute -> allocates state. A failed prefill is reported on its
+      // own and the memory measurement is skipped, because a delta taken
+      // around a failed execute says nothing about per-session state size.
+      const bool prefilled =
+          extra->prefill_tokens(prompt_a, &samp) == Error::Ok;
+      check("extra session prefill", prefilled);
+      const int64_t free_after = gpu_free();
+      // gpu_free() returns -1 when the query fails; only compare valid reads.
+      if (prefilled && free_before > 0 && free_after > 0) {
+        const int64_t delta = free_before - free_after;
+        printf(
+            "    extra-session GPU delta=%lld bytes (est/session=%lld)\n",
+            static_cast<long long>(delta),
+            static_cast<long long>(est));
+        check(
+            "extra session is state-sized (>0, < 4 GB, not an ~18 GB reload)",
+            delta > 0 && delta < (4LL << 30));
+        if (est > 0) {
+          // Allow 2x the estimate plus 256 MiB of slack.
+          check(
+              "memory delta within 2x of estimated_bytes_per_session",
+              delta <= est * 2 + (256LL << 20));
+        }
+      }
+    }
+  } // extra released here -> frees its slot before the capacity test
+
+  // (4) Capacity: the (max_sessions+1)th create_session fails (no silent
+  // share). Every session created in steps (1)-(3) was scoped and has already
+  // been destroyed, so this loop starts with no sessions held by this test and
+  // keeps creating until create_session() refuses.
+  const size_t max_sessions = static_cast<size_t>(config.max_sessions);
+  std::vector<std::unique_ptr<llm::LLMSession>> held;
+  while (true) {
+    auto r = engine->create_session();
+    if (r.error() != Error::Ok) {
+      break;
+    }
+    held.push_back(std::move(r.get()));
+    if (held.size() > max_sessions + 2) {
+      break; // guard against a non-enforcing backend
+    }
+  }
+  check(
+      "capacity enforced: create_session fails past max_sessions",
+      held.size() <= max_sessions);
+
+  printf(
+      "\n%s (%d failure(s))\n",
+      g_failures ? "FAILURES" : "ALL PASS",
+      g_failures);
+  return g_failures ? 1 : 0;
+}
@@ -61,8 +61,11 @@ struct LLMServingCapacity {
   // sessions would copy the whole model); raise only on a backend proven to
   // share packed weights.
   int32_t max_physical_sessions_without_weight_duplication = 1;
-  // Planned bytes one session adds (KV + activations), for memory-budget
-  // admission. 0 = unknown; the server skips the memory clamp.
+  // Planned bytes one session adds (KV + activations). Reported for a FUTURE
+  // memory-budget admission policy; NOT yet enforced -- admission is currently
+  // by session COUNT only (--max-sessions). Over-provisioning therefore fails
+  // at the first execute (cudaMalloc) of the over-committed session, not at
+  // admit time. 0 = unknown.
   int64_t estimated_bytes_per_session = 0;
 };
 
@@ -79,14 +82,28 @@ class ET_EXPERIMENTAL LLMSession {
   /// `initial_sampling` (optional): the sampling config for the FIRST generated
   /// token, for backends that sample during prefill (e.g. in-graph sampling).
   /// Pass it so the first token uses the request's sampling instead of a stale
-  /// default. Backends that only sample in decode_one() ignore it.
+  /// default. Backends that only sample in decode_one() ignore it. NOTE:
+  /// because the first token is sampled here, it does NOT pass through
+  /// decode_one()'s logit processors -- a grammar/tool mask that must constrain
+  /// the opening token is not applied to it (a known limitation for
+  /// grammar-constrained serving).
+  ///
+  /// ERROR CONTRACT: an error may be returned AFTER backend state has already
+  /// mutated. On any error from prefill_tokens()/decode_one(), the session is
+  /// POISONED -- position() may no longer agree with the resident KV. The
+  /// caller must call reset() (and only proceed once it returns Ok) before any
+  /// further prefill/decode; it must NOT retry the failed call. The serving
+  /// worker enforces this (marks the session dirty and forces a reset next
+  /// request).
   virtual ::executorch::runtime::Error prefill_tokens(
       std::vector<uint64_t> tokens,
       const SamplingConfig* initial_sampling = nullptr) = 0;
 
   /// Decode one token from the pending state; looping reproduces a full
   /// generation while returning exact sampled token ids. A single decode_one()
   /// runs one forward pass and is not interruptible mid-call (see stop()).
+  /// On error the session is poisoned -- see the error contract on
+  /// prefill_tokens() (reset() before any further use; never retry).
   virtual ::executorch::runtime::Result<DecodeResult> decode_one(
       const SamplingConfig& sampling) = 0;
 
 
@@ -160,6 +160,14 @@ Session capacity is determined by the worker/engine — a single worker hosts ma
 isolated sessions on one weight load — so `--num-runners` accepts 1; extra worker
 processes would each carry their own copy of the weights.
 
+The **generic `text_llm_worker` is scratch-only (V1)**: `TextLLMEngine::serving_capacity()`
+reports a conservative capacity of 1, so `max_named = max(0, capacity-1) = 0` — the
+default `server.py` serves only the anonymous scratch session (no named `session_id`s,
+no warm resume). The named-session / warm-resume / token-ID machinery is exercised
+by a model-specific worker whose engine reports capacity > 1 (the Qwen3.5-MoE CUDA
+worker). This is intentional; the generic worker stays minimal until a backend is
+proven to host multiple physical sessions without duplicating weights.
+
 Cancellation is best-effort: a worker request runs to completion and is not
 interruptible mid-generation in V1, so `runner.stop()` means "the control plane
 stops consuming and the worker finishes the current request" rather than a hard