[UPDATE] Refresh qwen3_5_moe comments: drop stale CUDA-graph references, describe the shared multi-session engine

mergennachin · mergennachin · commit 394b0c107c59 · 2026-06-08T12:27:54.000-07:00
[ghstack-poisoned]
diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp
@@ -134,9 +134,9 @@ int main(int argc, char** argv) {
   stats.num_prompt_tokens = num_prompt_tokens;
 
   // Warmup + timed iterations on one loaded session (reset between). The first
-  // FLAGS_warmup iterations are discarded; they trigger CUDA-graph capture,
-  // allocator growth, and GPU clock ramp so the timed iterations reflect steady
-  // state. Text is printed only on the first iteration (coherence check).
+  // FLAGS_warmup iterations are discarded; they let allocator growth and GPU
+  // clock ramp settle so the timed iterations reflect steady state. Text is
+  // printed only on the first iteration (coherence check).
   llm::SamplingConfig sampling;
   sampling.temperature = static_cast<float>(FLAGS_temperature);
   const int total_iters = FLAGS_warmup + std::max(1, FLAGS_num_iters);
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp b/examples/models/qwen3_5_moe/qwen35_moe_engine.cpp
@@ -73,10 +73,11 @@ Result<uint64_t> read_sampled_token(
 #endif
 }
 
-// Build a Qwen Module with shared mutable arenas (so prefill and decode share
-// KV/conv/recurrent state) and, on CUDA, the weight-sharing/cuda-graph backend
-// options that MUST be set before load_method. Loads the prefill+decode methods
-// (this is the heavy ~weights load). Shared by create_session() and reset().
+// Build the one shared Qwen Module with shared mutable arenas (so prefill and
+// decode share KV/conv/recurrent state) and, on CUDA, the weight-sharing
+// backend option that MUST be set before load_method. Loads the prefill+decode
+// methods once (the heavy ~weights load). Called once when the engine is
+// created.
 Result<std::unique_ptr<Module>> build_qwen_module(
     const Qwen35MoEConfig& config) {
   std::vector<std::string> data_files;
@@ -187,8 +188,8 @@ class Qwen35MoESession : public LLMSession {
         tokenizer_(tokenizer),
         metadata_(std::move(metadata)),
         eos_ids_(std::move(eos_ids)) {
-    // Persistent single-step decode buffers: stable addresses are required so
-    // CUDA-graph capture (which records buffer pointers) can replay each step.
+    // Persistent single-step decode buffers, reused (updated in place) across
+    // decode steps to avoid per-step reallocation.
     decode_tokens_ = from_blob(
         decode_token_data_, {1, 1}, executorch::aten::ScalarType::Long);
     decode_pos_ =
@@ -457,7 +458,7 @@ class Qwen35MoESession : public LLMSession {
   float temperature_ = -1.0f;
   std::atomic<bool> stop_{false};
 
-  // Persistent single-step decode buffers (stable addresses for CUDA graph).
+  // Persistent single-step decode buffers (reused across decode steps).
   int64_t decode_token_data_[1] = {0};
   int64_t decode_pos_data_[1] = {0};
   TensorPtr decode_tokens_;
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_engine.h b/examples/models/qwen3_5_moe/qwen35_moe_engine.h
@@ -12,9 +12,10 @@
 //
 // The public surface is backend-agnostic: the server receives an LLMEngine and
 // never branches on CUDA vs MLX. Backend-specific execution (CUDA in-graph
-// sampling, weight-sharing/cuda-graph backend options, device sync) is isolated
-// behind EXECUTORCH_BUILD_CUDA inside the .cpp; those isolated points are where
-// an MLX runtime would slot in. MLX is NOT implemented or validated here.
+// sampling, the weight-sharing backend option, per-session mutable rebinding,
+// device sync) is isolated behind EXECUTORCH_BUILD_CUDA inside the .cpp; those
+// isolated points are where an MLX runtime would slot in. MLX is NOT
+// implemented or validated here.
 //
 // V2 (CUDA): the ENGINE is multi-session — one shared Module (weights loaded
 // once); create_session() hands out multiple logical sessions, each rebinding
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp b/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp
@@ -18,8 +18,10 @@
 // process segfaults in the int4 matmul (validated). Here the model runs in a
 // plain synchronous loop in its own process, which is reliable.
 //
-// V1: single-slot (one engine == one ~18GB weight allocation == one session);
-// the control plane queues concurrent requests on the resident session.
+// Single-slot serving: this worker creates one session and the control plane
+// queues concurrent requests on it. (The engine itself can host multiple
+// sessions on the one ~18GB weight allocation; exposing that over the worker
+// protocol is a follow-up.)
 
 #include <gflags/gflags.h>