Skip to content

Commit 394b0c1

Browse files
committed
[UPDATE] Update
[ghstack-poisoned]
1 parent 76dd40c commit 394b0c1

4 files changed

Lines changed: 19 additions & 15 deletions

File tree

examples/models/qwen3_5_moe/main.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -134,9 +134,9 @@ int main(int argc, char** argv) {
134134
stats.num_prompt_tokens = num_prompt_tokens;
135135

136136
// Warmup + timed iterations on one loaded session (reset between). The first
137-
// FLAGS_warmup iterations are discarded; they trigger CUDA-graph capture,
138-
// allocator growth, and GPU clock ramp so the timed iterations reflect steady
139-
// state. Text is printed only on the first iteration (coherence check).
137+
// FLAGS_warmup iterations are discarded; they let allocator growth and GPU
138+
// clock ramp settle so the timed iterations reflect steady state. Text is
139+
// printed only on the first iteration (coherence check).
140140
llm::SamplingConfig sampling;
141141
sampling.temperature = static_cast<float>(FLAGS_temperature);
142142
const int total_iters = FLAGS_warmup + std::max(1, FLAGS_num_iters);

examples/models/qwen3_5_moe/qwen35_moe_engine.cpp

Lines changed: 8 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -73,10 +73,11 @@ Result<uint64_t> read_sampled_token(
7373
#endif
7474
}
7575

76-
// Build a Qwen Module with shared mutable arenas (so prefill and decode share
77-
// KV/conv/recurrent state) and, on CUDA, the weight-sharing/cuda-graph backend
78-
// options that MUST be set before load_method. Loads the prefill+decode methods
79-
// (this is the heavy ~weights load). Shared by create_session() and reset().
76+
// Build the one shared Qwen Module: shared mutable arenas (so prefill and
77+
// decode share KV/conv/recurrent state) and, on CUDA, the weight-sharing
78+
// backend option that MUST be set before load_method. Loads the prefill+decode
79+
// methods once (the heavy ~weights load). Called once when the engine is
80+
// created.
8081
Result<std::unique_ptr<Module>> build_qwen_module(
8182
const Qwen35MoEConfig& config) {
8283
std::vector<std::string> data_files;
@@ -187,8 +188,8 @@ class Qwen35MoESession : public LLMSession {
187188
tokenizer_(tokenizer),
188189
metadata_(std::move(metadata)),
189190
eos_ids_(std::move(eos_ids)) {
190-
// Persistent single-step decode buffers: stable addresses are required so
191-
// CUDA-graph capture (which records buffer pointers) can replay each step.
191+
// Persistent single-step decode buffers, reused (updated in place) across
192+
// decode steps to avoid per-step reallocation.
192193
decode_tokens_ = from_blob(
193194
decode_token_data_, {1, 1}, executorch::aten::ScalarType::Long);
194195
decode_pos_ =
@@ -457,7 +458,7 @@ class Qwen35MoESession : public LLMSession {
457458
float temperature_ = -1.0f;
458459
std::atomic<bool> stop_{false};
459460

460-
// Persistent single-step decode buffers (stable addresses for CUDA graph).
461+
// Persistent single-step decode buffers (reused across decode steps).
461462
int64_t decode_token_data_[1] = {0};
462463
int64_t decode_pos_data_[1] = {0};
463464
TensorPtr decode_tokens_;

examples/models/qwen3_5_moe/qwen35_moe_engine.h

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,9 +12,10 @@
1212
//
1313
// The public surface is backend-agnostic: the server receives an LLMEngine and
1414
// never branches on CUDA vs MLX. Backend-specific execution (CUDA in-graph
15-
// sampling, weight-sharing/cuda-graph backend options, device sync) is isolated
16-
// behind EXECUTORCH_BUILD_CUDA inside the .cpp; those isolated points are where
17-
// an MLX runtime would slot in. MLX is NOT implemented or validated here.
15+
// sampling, the weight-sharing backend option, per-session mutable rebinding,
16+
// device sync) is isolated behind EXECUTORCH_BUILD_CUDA inside the .cpp; those
17+
// isolated points are where an MLX runtime would slot in. MLX is NOT
18+
// implemented or validated here.
1819
//
1920
// V2 (CUDA): the ENGINE is multi-session — one shared Module (weights loaded
2021
// once); create_session() hands out multiple logical sessions, each rebinding

examples/models/qwen3_5_moe/qwen35_moe_worker.cpp

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,8 +18,10 @@
1818
// process segfaults in the int4 matmul (validated). Here the model runs in a
1919
// plain synchronous loop in its own process, which is reliable.
2020
//
21-
// V1: single-slot (one engine == one ~18GB weight allocation == one session);
22-
// the control plane queues concurrent requests on the resident session.
21+
// Single-slot serving: this worker creates one session and the control plane
22+
// queues concurrent requests on it. (The engine itself can host multiple
23+
// sessions on the one ~18GB weight allocation; exposing that over the worker
24+
// protocol is a follow-up.)
2325

2426
#include <gflags/gflags.h>
2527

0 commit comments

Comments
 (0)