@@ -73,10 +73,11 @@ Result<uint64_t> read_sampled_token(
7373#endif
7474}
7575
76- // Build a Qwen Module with shared mutable arenas (so prefill and decode share
77- // KV/conv/recurrent state) and, on CUDA, the weight-sharing/cuda-graph backend
78- // options that MUST be set before load_method. Loads the prefill+decode methods
79- // (this is the heavy ~weights load). Shared by create_session() and reset().
76+ // Build the one shared Qwen Module: shared mutable arenas (so prefill and
77+ // decode share KV/conv/recurrent state) and, on CUDA, the weight-sharing
78+ // backend option that MUST be set before load_method. Loads the prefill+decode
79+ // methods once (the heavy ~weights load). Called once when the engine is
80+ // created.
8081Result<std::unique_ptr<Module>> build_qwen_module (
8182 const Qwen35MoEConfig& config) {
8283 std::vector<std::string> data_files;
@@ -187,8 +188,8 @@ class Qwen35MoESession : public LLMSession {
187188 tokenizer_(tokenizer),
188189 metadata_(std::move(metadata)),
189190 eos_ids_(std::move(eos_ids)) {
190- // Persistent single-step decode buffers: stable addresses are required so
191- // CUDA-graph capture (which records buffer pointers) can replay each step.
191+ // Persistent single-step decode buffers, reused (updated in place) across
192+ // decode steps to avoid per-step reallocation.
192193 decode_tokens_ = from_blob (
193194 decode_token_data_, {1 , 1 }, executorch::aten::ScalarType::Long);
194195 decode_pos_ =
@@ -457,7 +458,7 @@ class Qwen35MoESession : public LLMSession {
457458 float temperature_ = -1 .0f ;
458459 std::atomic<bool > stop_{false };
459460
460- // Persistent single-step decode buffers (stable addresses for CUDA graph).
461+ // Persistent single-step decode buffers (reused across decode steps).
461462 int64_t decode_token_data_[1 ] = {0 };
462463 int64_t decode_pos_data_[1 ] = {0 };
463464 TensorPtr decode_tokens_;
0 commit comments