jd-opensource
diff --git a/xllm/core/common/global_flags.h b/xllm/core/common/global_flags.h
Lines changed: 3 additions & 0 deletions
diff --git a/xllm/core/framework/config/execution_config.cpp b/xllm/core/framework/config/execution_config.cpp
Lines changed: 16 additions & 0 deletions
diff --git a/xllm/core/framework/config/execution_config.h b/xllm/core/framework/config/execution_config.h
Lines changed: 6 additions & 0 deletions
diff --git a/xllm/core/framework/model/causal_vlm.h b/xllm/core/framework/model/causal_vlm.h
Lines changed: 9 additions & 0 deletions
diff --git a/xllm/core/framework/model/model_traits.h b/xllm/core/framework/model/model_traits.h
Lines changed: 12 additions & 0 deletions
diff --git a/xllm/core/framework/model_context.cpp b/xllm/core/framework/model_context.cpp
Lines changed: 2 additions & 1 deletion
diff --git a/xllm/core/layers/npu/npu_base_layer.cpp b/xllm/core/layers/npu/npu_base_layer.cpp
Lines changed: 15 additions & 2 deletions
diff --git a/xllm/core/runtime/options.h b/xllm/core/runtime/options.h
Lines changed: 4 additions & 0 deletions
@@ -151,6 +151,9 @@ DECLARE_int32(max_tokens_for_graph_mode);
 
 DECLARE_int32(acl_graph_decode_batch_size_limit);
 
+DECLARE_bool(enable_encoder_graph);  // ACL graph for the vision encoder
+DECLARE_string(encoder_graph_budgets);  // comma-separated token budgets
+
 DECLARE_bool(enable_chunked_prefill);
 
 DECLARE_string(master_node_addr);
 
@@ -62,6 +62,14 @@ DEFINE_int32(acl_graph_decode_batch_size_limit,
              "When actual decode batch_size > this value, ACL graph decode "
              "falls back to eager mode to avoid OOM.");
 
+DEFINE_bool(enable_encoder_graph,
+            false,  // opt-in: stays disabled unless explicitly set
+            "Whether to enable ACL graph for vision encoder");
+
+DEFINE_string(encoder_graph_budgets,
+              "1024,2048,4096,8192",  // same literal as the ExecutionConfig and Options defaults
+              "Comma-separated token budgets for encoder graph buckets");
+
 DEFINE_bool(enable_shm,
             false,
             "Whether to enable shared memory for executing model.");
@@ -91,6 +99,8 @@ void ExecutionConfig::from_flags() {
   XLLM_CONFIG_ASSIGN_FROM_FLAG(enable_graph_vmm_pool);
   XLLM_CONFIG_ASSIGN_FROM_FLAG(max_tokens_for_graph_mode);
   XLLM_CONFIG_ASSIGN_FROM_FLAG(acl_graph_decode_batch_size_limit);
+  XLLM_CONFIG_ASSIGN_FROM_FLAG(enable_encoder_graph);
+  XLLM_CONFIG_ASSIGN_FROM_FLAG(encoder_graph_budgets);
   XLLM_CONFIG_ASSIGN_FROM_FLAG(enable_shm);
   XLLM_CONFIG_ASSIGN_FROM_FLAG(use_contiguous_input_buffer);
   XLLM_CONFIG_ASSIGN_FROM_FLAG(input_shm_size);
@@ -106,6 +116,8 @@ void ExecutionConfig::from_json(const JsonReader& json) {
   XLLM_CONFIG_ASSIGN_FROM_JSON(enable_graph_vmm_pool);
   XLLM_CONFIG_ASSIGN_FROM_JSON(max_tokens_for_graph_mode);
   XLLM_CONFIG_ASSIGN_FROM_JSON(acl_graph_decode_batch_size_limit);
+  XLLM_CONFIG_ASSIGN_FROM_JSON(enable_encoder_graph);
+  XLLM_CONFIG_ASSIGN_FROM_JSON(encoder_graph_budgets);
   XLLM_CONFIG_ASSIGN_FROM_JSON(enable_shm);
   XLLM_CONFIG_ASSIGN_FROM_JSON(use_contiguous_input_buffer);
   XLLM_CONFIG_ASSIGN_FROM_JSON(input_shm_size);
@@ -130,6 +142,10 @@ void ExecutionConfig::append_config_json(
       config_json, default_config, max_tokens_for_graph_mode);
   APPEND_CONFIG_JSON_VALUE_IF_NOT_DEFAULT(
       config_json, default_config, acl_graph_decode_batch_size_limit);
+  APPEND_CONFIG_JSON_VALUE_IF_NOT_DEFAULT(
+      config_json, default_config, enable_encoder_graph);
+  APPEND_CONFIG_JSON_VALUE_IF_NOT_DEFAULT(
+      config_json, default_config, encoder_graph_budgets);
   APPEND_CONFIG_JSON_VALUE_IF_NOT_DEFAULT(
       config_json, default_config, enable_shm);
   APPEND_CONFIG_JSON_VALUE_IF_NOT_DEFAULT(
 
@@ -47,6 +47,8 @@ class ExecutionConfig final {
          "enable_graph_vmm_pool",
          "max_tokens_for_graph_mode",
          "acl_graph_decode_batch_size_limit",
+         "enable_encoder_graph",
+         "encoder_graph_budgets",
          "enable_shm",
          "use_contiguous_input_buffer",
          "input_shm_size",
@@ -69,6 +71,10 @@ class ExecutionConfig final {
 
   PROPERTY(int32_t, acl_graph_decode_batch_size_limit) = 16;
 
+  PROPERTY(bool, enable_encoder_graph) = false;  // ACL graph for the vision encoder; assigned in from_flags()/from_json()
+
+  PROPERTY(std::string, encoder_graph_budgets) = "1024,2048,4096,8192";  // comma-separated token budgets for encoder graph buckets
+
   PROPERTY(bool, enable_shm) = false;
 
   PROPERTY(bool, use_contiguous_input_buffer) = true;
 
@@ -36,6 +36,8 @@ class CausalVLM : public CausalLM {
   virtual torch::Tensor get_input_embeddings(
       const torch::Tensor& input_ids,
       const ModelInputParams& input_params) = 0;
+  virtual void init_encoder_graph_manager(const ModelArgs& args,  // optional hook, not pure virtual
+                                          const torch::Device& device) {}  // default body is empty: does nothing unless overridden
 };
 
 template <typename Model>
@@ -161,6 +163,13 @@ class CausalVLMImpl : public CausalVLM {
 
   torch::Device device() const override { return options_.device(); }
 
+  void init_encoder_graph_manager(const ModelArgs& args,  // forwards to model_ when Model offers the method
+                                  const torch::Device& device) override {
+    if constexpr (detail::has_init_encoder_graph_manager<Model>::value) {  // resolved at compile time
+      model_->init_encoder_graph_manager(args, device);  // arguments are passed through unchanged
+    }  // otherwise the branch is discarded and this call does nothing
+  }
+
   const torch::TensorOptions& options() const override { return options_; }
 
  private:
 
@@ -25,6 +25,8 @@ namespace xllm {
 struct ModelInputParams;
 struct ModelGraphMetadataState;
 
+class ModelLoader;  // forward declaration; NOTE(review): no hunk in this diff uses ModelLoader - confirm it is needed
+
 namespace layer {
 class LmHead;
 class WordEmbedding;
@@ -217,6 +219,16 @@ struct has_init_or_refresh_rolling_runtime<
         std::declval<int32_t>(),
         std::declval<const std::string&>()))>> : std::true_type {};
 
+template <typename T, typename = void>  // detection trait; this primary template is the "method absent" case
+struct has_init_encoder_graph_manager : std::false_type {};
+
+template <typename T>  // selected only when the call expression below is well-formed
+struct has_init_encoder_graph_manager<
+    T,
+    std::void_t<decltype(std::declval<T>()->init_encoder_graph_manager(  // T is accessed through operator->
+        std::declval<const ModelArgs&>(),
+        std::declval<const torch::Device&>()))>> : std::true_type {};
+
 #endif
 }  // namespace detail
 }  // namespace xllm
@@ -41,7 +41,8 @@ bool should_enable_async_tiling_copy_stream() {
   // ATB copy-stream teardown is not reversible for the same context on the
   // current CANN/PTA stack, so contexts that may enter graph capture must not
   // pre-create the helper stream.
-  if (::xllm::ExecutionConfig::get_instance().enable_graph()) {
+  if (::xllm::ExecutionConfig::get_instance().enable_graph() ||
+      ::xllm::ExecutionConfig::get_instance().enable_encoder_graph()) {
     return false;
   }
   return util::get_bool_env("ATB_USE_TILING_COPY_STREAM", false);
 
@@ -81,7 +81,8 @@ atb::Status BaseLayer::execute_node(atb_speed::Model::Node& node,
   //   However, libtorch_npu current stream is set to default stream after
   //   capture ends, causing inconsistency between ATB context and the actual
   //   execution stream
-  if (::xllm::ExecutionConfig::get_instance().enable_graph()) {
+  if (::xllm::ExecutionConfig::get_instance().enable_graph() ||
+      ::xllm::ExecutionConfig::get_instance().enable_encoder_graph()) {
     void* stream = c10_npu::getCurrentNPUStream(device_.index()).stream();
     context_->SetExecuteStream(stream);
   }
@@ -103,7 +104,19 @@ atb::Status BaseLayer::execute_node(atb_speed::Model::Node& node,
   atb::Status st =
       node.operation->Setup(node.variantPack, node.workspaceSize, context_);
   if (st != 0) {
-    LOG(ERROR) << " setup layer node fail, not call execute";
+    LOG(ERROR) << " setup layer node fail, not call execute, name=" << name_;
+    for (size_t i = 0; i < node.variantPack.inTensors.size(); ++i) {
+      auto& t = node.variantPack.inTensors.at(i);
+      LOG(ERROR) << "  inTensor[" << i << "] shape=[" << t.desc.shape.dimNum
+                 << " dims] dtype=" << t.desc.dtype
+                 << " format=" << t.desc.format;
+    }
+    for (size_t i = 0; i < node.variantPack.outTensors.size(); ++i) {
+      auto& t = node.variantPack.outTensors.at(i);
+      LOG(ERROR) << "  outTensor[" << i << "] shape=[" << t.desc.shape.dimNum
+                 << " dims] dtype=" << t.desc.dtype
+                 << " format=" << t.desc.format;
+    }
     return st;
   }
 
 
@@ -248,6 +248,10 @@ struct Options {
   // maximum number of tokens for graph execution
   PROPERTY(int32_t, max_tokens_for_graph_mode) = 2048;
 
+  PROPERTY(bool, enable_encoder_graph) = false;  // presumably mirrors the enable_encoder_graph flag - TODO confirm
+
+  PROPERTY(std::string, encoder_graph_budgets) = "1024,2048,4096,8192";  // presumably mirrors the encoder_graph_budgets flag - TODO confirm
+
   // beam width for beam search
   PROPERTY(int32_t, beam_width) = 128;