Skip to content

Commit 98ec508

Browse files
committed
feat: add ViT encoder ACL Graph capture for Qwen3-VL
1 parent 77b91c5 commit 98ec508

15 files changed

Lines changed: 579 additions & 14 deletions

xllm/core/common/global_flags.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -151,6 +151,9 @@ DECLARE_int32(max_tokens_for_graph_mode);
151151

152152
DECLARE_int32(acl_graph_decode_batch_size_limit);
153153

154+
DECLARE_bool(enable_encoder_graph);
155+
DECLARE_string(encoder_graph_budgets);
156+
154157
DECLARE_bool(enable_chunked_prefill);
155158

156159
DECLARE_string(master_node_addr);

xllm/core/framework/config/execution_config.cpp

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,14 @@ DEFINE_int32(acl_graph_decode_batch_size_limit,
6262
"When actual decode batch_size > this value, ACL graph decode "
6363
"falls back to eager mode to avoid OOM.");
6464

65+
DEFINE_bool(enable_encoder_graph,  // command-line flag gating ACL graph use for the vision encoder
66+
false,  // off by default; the ExecutionConfig and Options properties of the same name also default to false
67+
"Whether to enable ACL graph for vision encoder");
68+
69+
DEFINE_string(encoder_graph_budgets,  // command-line flag holding the encoder graph bucket budgets as one string
70+
"1024,2048,4096,8192",  // same literal is repeated as the ExecutionConfig and Options defaults; keep all three in sync
71+
"Comma-separated token budgets for encoder graph buckets");
72+
6573
DEFINE_bool(enable_shm,
6674
false,
6775
"Whether to enable shared memory for executing model.");
@@ -91,6 +99,8 @@ void ExecutionConfig::from_flags() {
9199
XLLM_CONFIG_ASSIGN_FROM_FLAG(enable_graph_vmm_pool);
92100
XLLM_CONFIG_ASSIGN_FROM_FLAG(max_tokens_for_graph_mode);
93101
XLLM_CONFIG_ASSIGN_FROM_FLAG(acl_graph_decode_batch_size_limit);
102+
XLLM_CONFIG_ASSIGN_FROM_FLAG(enable_encoder_graph);
103+
XLLM_CONFIG_ASSIGN_FROM_FLAG(encoder_graph_budgets);
94104
XLLM_CONFIG_ASSIGN_FROM_FLAG(enable_shm);
95105
XLLM_CONFIG_ASSIGN_FROM_FLAG(use_contiguous_input_buffer);
96106
XLLM_CONFIG_ASSIGN_FROM_FLAG(input_shm_size);
@@ -106,6 +116,8 @@ void ExecutionConfig::from_json(const JsonReader& json) {
106116
XLLM_CONFIG_ASSIGN_FROM_JSON(enable_graph_vmm_pool);
107117
XLLM_CONFIG_ASSIGN_FROM_JSON(max_tokens_for_graph_mode);
108118
XLLM_CONFIG_ASSIGN_FROM_JSON(acl_graph_decode_batch_size_limit);
119+
XLLM_CONFIG_ASSIGN_FROM_JSON(enable_encoder_graph);
120+
XLLM_CONFIG_ASSIGN_FROM_JSON(encoder_graph_budgets);
109121
XLLM_CONFIG_ASSIGN_FROM_JSON(enable_shm);
110122
XLLM_CONFIG_ASSIGN_FROM_JSON(use_contiguous_input_buffer);
111123
XLLM_CONFIG_ASSIGN_FROM_JSON(input_shm_size);
@@ -130,6 +142,10 @@ void ExecutionConfig::append_config_json(
130142
config_json, default_config, max_tokens_for_graph_mode);
131143
APPEND_CONFIG_JSON_VALUE_IF_NOT_DEFAULT(
132144
config_json, default_config, acl_graph_decode_batch_size_limit);
145+
APPEND_CONFIG_JSON_VALUE_IF_NOT_DEFAULT(
146+
config_json, default_config, enable_encoder_graph);
147+
APPEND_CONFIG_JSON_VALUE_IF_NOT_DEFAULT(
148+
config_json, default_config, encoder_graph_budgets);
133149
APPEND_CONFIG_JSON_VALUE_IF_NOT_DEFAULT(
134150
config_json, default_config, enable_shm);
135151
APPEND_CONFIG_JSON_VALUE_IF_NOT_DEFAULT(

xllm/core/framework/config/execution_config.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,8 @@ class ExecutionConfig final {
4747
"enable_graph_vmm_pool",
4848
"max_tokens_for_graph_mode",
4949
"acl_graph_decode_batch_size_limit",
50+
"enable_encoder_graph",
51+
"encoder_graph_budgets",
5052
"enable_shm",
5153
"use_contiguous_input_buffer",
5254
"input_shm_size",
@@ -69,6 +71,10 @@ class ExecutionConfig final {
6971

7072
PROPERTY(int32_t, acl_graph_decode_batch_size_limit) = 16;
7173

74+
PROPERTY(bool, enable_encoder_graph) = false;
75+
76+
PROPERTY(std::string, encoder_graph_budgets) = "1024,2048,4096,8192";
77+
7278
PROPERTY(bool, enable_shm) = false;
7379

7480
PROPERTY(bool, use_contiguous_input_buffer) = true;

xllm/core/framework/model/causal_vlm.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ class CausalVLM : public CausalLM {
3636
virtual torch::Tensor get_input_embeddings(
3737
const torch::Tensor& input_ids,
3838
const ModelInputParams& input_params) = 0;
39+
virtual void init_encoder_graph_manager(const ModelArgs& args,  // optional hook: not pure virtual, unlike get_input_embeddings above
40+
const torch::Device& device) {}  // default body is empty, so subclasses that do not override it get a no-op
3941
};
4042

4143
template <typename Model>
@@ -161,6 +163,13 @@ class CausalVLMImpl : public CausalVLM {
161163

162164
torch::Device device() const override { return options_.device(); }
163165

166+
void init_encoder_graph_manager(const ModelArgs& args,  // overrides the CausalVLM hook for the wrapped Model
167+
const torch::Device& device) override {
168+
if constexpr (detail::has_init_encoder_graph_manager<Model>::value) {  // compile-time check: the call below is only instantiated when Model exposes the method
169+
model_->init_encoder_graph_manager(args, device);  // forward both arguments unchanged to the wrapped model
170+
}
171+
}  // when the trait is false the function body is empty (no-op)
172+
164173
const torch::TensorOptions& options() const override { return options_; }
165174

166175
private:

xllm/core/framework/model/model_traits.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ namespace xllm {
2525
struct ModelInputParams;
2626
struct ModelGraphMetadataState;
2727

28+
class ModelLoader;
29+
2830
namespace layer {
2931
class LmHead;
3032
class WordEmbedding;
@@ -217,6 +219,16 @@ struct has_init_or_refresh_rolling_runtime<
217219
std::declval<int32_t>(),
218220
std::declval<const std::string&>()))>> : std::true_type {};
219221

222+
template <typename T, typename = void>  // primary template; the defaulted second parameter is the slot the void_t specialization matches
223+
struct has_init_encoder_graph_manager : std::false_type {};  // fallback: false when the expression in the specialization is ill-formed
224+
225+
template <typename T>
226+
struct has_init_encoder_graph_manager<
227+
T,
228+
std::void_t<decltype(std::declval<T>()->init_encoder_graph_manager(  // probed through operator->, so T must be pointer-like
229+
std::declval<const ModelArgs&>(),
230+
std::declval<const torch::Device&>()))>> : std::true_type {};  // true when the call with (const ModelArgs&, const torch::Device&) is well-formed
231+
220232
#endif
221233
} // namespace detail
222234
} // namespace xllm

xllm/core/framework/model_context.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ bool should_enable_async_tiling_copy_stream() {
4141
// ATB copy-stream teardown is not reversible for the same context on the
4242
// current CANN/PTA stack, so contexts that may enter graph capture must not
4343
// pre-create the helper stream.
44-
if (::xllm::ExecutionConfig::get_instance().enable_graph()) {
44+
if (::xllm::ExecutionConfig::get_instance().enable_graph() ||
45+
::xllm::ExecutionConfig::get_instance().enable_encoder_graph()) {
4546
return false;
4647
}
4748
return util::get_bool_env("ATB_USE_TILING_COPY_STREAM", false);

xllm/core/layers/npu/npu_base_layer.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,8 @@ atb::Status BaseLayer::execute_node(atb_speed::Model::Node& node,
8181
// However, libtorch_npu current stream is set to default stream after
8282
// capture ends, causing inconsistency between ATB context and the actual
8383
// execution stream
84-
if (::xllm::ExecutionConfig::get_instance().enable_graph()) {
84+
if (::xllm::ExecutionConfig::get_instance().enable_graph() ||
85+
::xllm::ExecutionConfig::get_instance().enable_encoder_graph()) {
8586
void* stream = c10_npu::getCurrentNPUStream(device_.index()).stream();
8687
context_->SetExecuteStream(stream);
8788
}
@@ -103,7 +104,19 @@ atb::Status BaseLayer::execute_node(atb_speed::Model::Node& node,
103104
atb::Status st =
104105
node.operation->Setup(node.variantPack, node.workspaceSize, context_);
105106
if (st != 0) {
106-
LOG(ERROR) << " setup layer node fail, not call execute";
107+
LOG(ERROR) << " setup layer node fail, not call execute, name=" << name_;
108+
for (size_t i = 0; i < node.variantPack.inTensors.size(); ++i) {
109+
auto& t = node.variantPack.inTensors.at(i);
110+
LOG(ERROR) << " inTensor[" << i << "] shape=[" << t.desc.shape.dimNum
111+
<< " dims] dtype=" << t.desc.dtype
112+
<< " format=" << t.desc.format;
113+
}
114+
for (size_t i = 0; i < node.variantPack.outTensors.size(); ++i) {
115+
auto& t = node.variantPack.outTensors.at(i);
116+
LOG(ERROR) << " outTensor[" << i << "] shape=[" << t.desc.shape.dimNum
117+
<< " dims] dtype=" << t.desc.dtype
118+
<< " format=" << t.desc.format;
119+
}
107120
return st;
108121
}
109122

xllm/core/runtime/options.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,10 @@ struct Options {
248248
// maximum number of tokens for graph execution
249249
PROPERTY(int32_t, max_tokens_for_graph_mode) = 2048;
250250

251+
PROPERTY(bool, enable_encoder_graph) = false;
252+
253+
PROPERTY(std::string, encoder_graph_budgets) = "1024,2048,4096,8192";
254+
251255
// beam width for beam search
252256
PROPERTY(int32_t, beam_width) = 128;
253257

0 commit comments

Comments
 (0)