XNNPACK graph runtime: backend integration behind use_graph_runtime

GregoryComer · GregoryComer · commit bfcfd5d2c8c7 · 2026-06-12T22:19:06.000-07:00
Wires the graph runtime into XnnpackBackend behind the use_graph_runtime backend option (default off, with per-method runtime-spec override). When enabled, init() deserializes via FlatbufferGraphBuilder and builds an Executor; execute()/destroy() gain the graph path alongside the legacy XNNCompiler path. With prefer_in_tree_kernel disabled, existing all-XNNPACK models run end-to-end on the new executor. Authored with Claude. ghstack-source-id: 06e4862 ghstack-comment-id: 4695613250 Pull-Request: #20256
diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp
@@ -6,18 +6,25 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <executorch/backends/xnnpack/runtime/FlatbufferGraphBuilder.h>
 #include <executorch/backends/xnnpack/runtime/XNNCompiler.h>
 #include <executorch/backends/xnnpack/runtime/XNNPACKBackend.h>
 #include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
 #include <executorch/backends/xnnpack/runtime/XNNWorkspace.h>
 #include <executorch/backends/xnnpack/runtime/XnnpackBackendOptions.h>
+#include <executorch/backends/xnnpack/runtime/executor/executor.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
 #include <executorch/runtime/executor/pte_data_map.h>
+#include <executorch/runtime/platform/log.h>
+#include <chrono>
 
+#include <cstring>
 #include <memory>
 #include <mutex>
+#include <vector>
 
 #pragma clang diagnostic ignored "-Wglobal-constructors"
 
@@ -41,6 +48,16 @@ using executorch::runtime::FreeableBuffer;
 using executorch::runtime::Result;
 using executorch::runtime::Span;
 
+struct XnnpackDelegateHandle {
+  bool is_graph_runtime{false}; // Selects which executor pointer is in use.
+  // Set on the legacy path; the XNNExecutor lives in runtime-allocator memory.
+  xnnpack::delegate::XNNExecutor* legacy_executor{nullptr};
+  // Set on the graph path; the Executor is heap-allocated with new.
+  xnnpack::executor::Executor* graph_executor{nullptr};
+  std::vector<uint32_t> input_external_ids{}; // Graph path: input arg ids.
+  std::vector<uint32_t> output_external_ids{}; // Graph path: output arg ids.
+};
+
 class XnnpackBackend final
     : public ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface {
  public:
@@ -66,26 +83,67 @@ class XnnpackBackend final
       BackendInitContext& context,
       FreeableBuffer* processed,
       ArrayRef<CompileSpec> compile_specs) const override {
+    auto* handle = context.get_runtime_allocator()
+                       ->allocateInstance<XnnpackDelegateHandle>();
+    if (handle == nullptr) {
+      return Error::MemoryAllocationFailed;
+    }
+    new (handle) XnnpackDelegateHandle();
+
+    bool use_graph_runtime = options_.resolve_graph_runtime(context);
+    handle->is_graph_runtime = use_graph_runtime;
+
+    if (use_graph_runtime) {
+      auto t0 = std::chrono::steady_clock::now();
+      const NamedDataMap* named_data_map = context.get_named_data_map();
+      ET_UNWRAP(
+          result,
+          xnnpack::FlatbufferGraphBuilder::build(
+              processed->data(), processed->size(), named_data_map));
+      processed->Free();
+      auto t1 = std::chrono::steady_clock::now();
+
+      ET_UNWRAP(
+          built_executor, xnnpack::executor::Executor::build(result.graph));
+      auto* executor =
+          new xnnpack::executor::Executor(std::move(built_executor));
+      auto t2 = std::chrono::steady_clock::now();
+      handle->graph_executor = executor;
+      handle->input_external_ids = std::move(result.input_external_ids);
+      handle->output_external_ids = std::move(result.output_external_ids);
+      ET_LOG(
+          Info,
+          "Graph runtime init: deserialize=%lldms executor_build=%lldms",
+          (long long)std::chrono::duration_cast<std::chrono::milliseconds>(
+              t1 - t0)
+              .count(),
+          (long long)std::chrono::duration_cast<std::chrono::milliseconds>(
+              t2 - t1)
+              .count());
+      return handle;
+    }
+
     auto executor = context.get_runtime_allocator()
                         ->allocateInstance<xnnpack::delegate::XNNExecutor>();
     if (executor == nullptr) {
+      handle->~XnnpackDelegateHandle();
       return Error::MemoryAllocationFailed;
     }
 
     const NamedDataMap* named_data_map = context.get_named_data_map();
-    // thread safe. This can happen when multiple threads call init() on
-    // the same backend instance.
 
     auto program_id =
         reinterpret_cast<uintptr_t>(context.get_runtime_allocator());
     auto sharing_mode_result = options_.resolve_sharing_mode(context);
     if (!sharing_mode_result.ok()) {
+      handle->~XnnpackDelegateHandle();
       return sharing_mode_result.error();
     }
     auto workspace_result =
         options_.workspace_manager().get_or_create_workspace(
             program_id, sharing_mode_result.get());
     if (!workspace_result.ok()) {
+      handle->~XnnpackDelegateHandle();
       return workspace_result.error();
     }
     auto workspace = workspace_result.get();
@@ -128,23 +186,27 @@ class XnnpackBackend final
     processed->Free();
 
     if (err != Error::Ok) {
-      // destroy() won't be called on this handle, so we need to clean it up
-      // now.
       executor->~XNNExecutor();
-
+      handle->~XnnpackDelegateHandle();
       ET_LOG(
           Error, "XNNCompiler::compileModel failed: 0x%x", (unsigned int)err);
       return err;
     }
-
-    return executor;
+    handle->legacy_executor = executor;
+    return handle;
   }
 
   Error execute(
       BackendExecutionContext& context,
       DelegateHandle* handle,
       Span<EValue*> args) const override {
-    auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
+    auto* delegate = static_cast<XnnpackDelegateHandle*>(handle);
+
+    if (delegate->is_graph_runtime) {
+      return execute_graph(delegate, args);
+    }
+
+    auto executor = delegate->legacy_executor;
 
     auto workspace = executor->get_workspace();
 
@@ -176,7 +238,15 @@ class XnnpackBackend final
 
   void destroy(DelegateHandle* handle) const override {
     if (handle != nullptr) {
-      auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
+      auto* delegate = static_cast<XnnpackDelegateHandle*>(handle);
+
+      if (delegate->is_graph_runtime) {
+        delete delegate->graph_executor;
+        delegate->~XnnpackDelegateHandle();
+        return;
+      }
+
+      auto executor = delegate->legacy_executor;
       auto workspace = executor->get_workspace();
 
       const std::lock_guard<std::mutex> lock_weights_cache(
@@ -200,6 +270,7 @@ class XnnpackBackend final
       // XNNExecutor is not trivially destructible. Since this was constructed
       // manually in init(), we must destroy it manually here.
       executor->~XNNExecutor();
+      delegate->~XnnpackDelegateHandle();
     }
   }
 
@@ -228,6 +299,143 @@ class XnnpackBackend final
   }
 
  private:
+  /// Runs one inference on the graph-runtime executor.
+  ///
+  /// Each input EValue tensor is wrapped (not copied) as an
+  /// xnnpack::core::Tensor whose sizes are listed in dim-order-permuted
+  /// layout. After the executor runs, each output EValue tensor is resized to
+  /// the shape the executor reports and, when the executor's output storage
+  /// is not the tensor's own buffer, the bytes are copied back.
+  ///
+  /// @param delegate Handle whose graph_executor and external ids are used.
+  /// @param args Delegate arguments, indexed by the handle's external ids.
+  /// @return Error::Ok on success; InvalidProgram for an out-of-range
+  ///     external id; InvalidArgument when the addressed argument is not a
+  ///     tensor; Internal when the executor's outputs disagree with the
+  ///     handle or the destination tensors; otherwise a callee's error.
+  Error execute_graph(XnnpackDelegateHandle* delegate, Span<EValue*> args)
+      const {
+    auto* executor = delegate->graph_executor;
+
+    // Build input tensors from EValue args.
+    std::vector<xnnpack::core::Tensor> inputs;
+    inputs.reserve(delegate->input_external_ids.size());
+    for (uint32_t ext_id : delegate->input_external_ids) {
+      ET_CHECK_OR_RETURN_ERROR(
+          ext_id < args.size(),
+          InvalidProgram,
+          "Input external id %u out of range (%zu args)",
+          ext_id,
+          args.size());
+      // toTensor() is fatal on a non-tensor EValue, so reject it here and
+      // surface a recoverable error to the caller instead.
+      ET_CHECK_OR_RETURN_ERROR(
+          args[ext_id] != nullptr && args[ext_id]->isTensor(),
+          InvalidArgument,
+          "Input external id %u is not a tensor",
+          ext_id);
+      auto& et_tensor = args[ext_id]->toTensor();
+      xnnpack::core::Tensor t;
+      // The external-value dtype is taken from the serialized graph spec; this
+      // field is informational for the input wrapper. Defaulting to Float32
+      // matches the supported (float) input set.
+      t.dtype = xnnpack::core::DType::Float32;
+      if (et_tensor.dim() == 0) {
+        // A rank-0 tensor is handed to the executor as a one-element shape.
+        t.sizes = {1};
+      } else {
+        // Pass dims in physical (dim-order-permuted) layout so a channels-last
+        // input matches the NHWC layout XNNPACK expects, mirroring the legacy
+        // XNNExecutor path.
+        size_t num_dims = et_tensor.dim();
+        executorch::aten::DimOrderType
+            dim_order[::executorch::runtime::kTensorDimensionLimit];
+        ET_CHECK_OK_OR_RETURN_ERROR(ET_RUNTIME_NAMESPACE::get_dim_order(
+            et_tensor, dim_order, num_dims));
+        t.sizes.resize(num_dims);
+        for (size_t d = 0; d < num_dims; d++) {
+          t.sizes[d] = static_cast<uint64_t>(
+              et_tensor.size(static_cast<int>(dim_order[d])));
+        }
+      }
+      t.storage.data = et_tensor.mutable_data_ptr();
+      t.storage.size_in_bytes = et_tensor.nbytes();
+      t.storage.owner = xnnpack::core::StorageOwner::External;
+      inputs.push_back(std::move(t));
+    }
+
+    ET_UNWRAP(outputs, executor->run({inputs.data(), inputs.size()}));
+
+    ET_CHECK_OR_RETURN_ERROR(
+        outputs.size() == delegate->output_external_ids.size(),
+        Internal,
+        "Executor produced %zu outputs, expected %zu",
+        outputs.size(),
+        delegate->output_external_ids.size());
+
+    // Copy output data back to EValue tensors.
+    for (size_t i = 0; i < delegate->output_external_ids.size(); i++) {
+      uint32_t ext_id = delegate->output_external_ids[i];
+      ET_CHECK_OR_RETURN_ERROR(
+          ext_id < args.size(),
+          InvalidProgram,
+          "Output external id %u out of range (%zu args)",
+          ext_id,
+          args.size());
+      // Same guard as for inputs: toTensor() is fatal on a non-tensor EValue.
+      ET_CHECK_OR_RETURN_ERROR(
+          args[ext_id] != nullptr && args[ext_id]->isTensor(),
+          InvalidArgument,
+          "Output external id %u is not a tensor",
+          ext_id);
+      auto& et_tensor = args[ext_id]->toTensor();
+      auto& out_tensor = outputs[i];
+
+      // Resize the output EValue tensor to match the computed shape. The
+      // executor reports dims in XNNPACK physical (channels-last) order;
+      // scatter them back to the tensor's logical order via its dim_order,
+      // mirroring the legacy XNNExecutor::resize_outputs path.
+      size_t num_dims = out_tensor.sizes.size();
+      // out_dim_order is a fixed-size stack array; reject a reported rank that
+      // would not fit before anything is written into it.
+      ET_CHECK_OR_RETURN_ERROR(
+          num_dims <= ::executorch::runtime::kTensorDimensionLimit,
+          Internal,
+          "Output %zu has %zu dims, exceeds the supported maximum",
+          i,
+          num_dims);
+      std::vector<executorch::aten::SizesType> new_sizes_vec(num_dims);
+      executorch::aten::DimOrderType
+          out_dim_order[::executorch::runtime::kTensorDimensionLimit];
+      ET_CHECK_OK_OR_RETURN_ERROR(ET_RUNTIME_NAMESPACE::get_dim_order(
+          et_tensor, out_dim_order, num_dims));
+      for (size_t d = 0; d < num_dims; d++) {
+        new_sizes_vec[out_dim_order[d]] =
+            static_cast<executorch::aten::SizesType>(out_tensor.sizes[d]);
+      }
+      executorch::aten::ArrayRef<executorch::aten::SizesType> new_sizes(
+          new_sizes_vec.data(), new_sizes_vec.size());
+      ET_CHECK_OK_OR_RETURN_ERROR(
+          executorch::runtime::resize_tensor(et_tensor, new_sizes));
+
+      if (out_tensor.storage.data != et_tensor.mutable_data_ptr()) {
+        ET_CHECK_OR_RETURN_ERROR(
+            out_tensor.storage.size_in_bytes <= et_tensor.nbytes(),
+            Internal,
+            "Output %zu is %zu bytes, exceeds tensor capacity %zu",
+            i,
+            out_tensor.storage.size_in_bytes,
+            et_tensor.nbytes());
+        std::memcpy(
+            et_tensor.mutable_data_ptr(),
+            out_tensor.storage.data,
+            out_tensor.storage.size_in_bytes);
+      }
+    }
+
+    return Error::Ok;
+  }
+
   mutable xnnpack::XnnpackBackendOptions options_;
 
   // Weights cache is global to all delegate instances.
diff --git a/backends/xnnpack/runtime/XNNPACKBackend.h b/backends/xnnpack/runtime/XNNPACKBackend.h
@@ -20,6 +20,10 @@ const char weight_cache_option_key[] = "weight_cache_enabled";
 // @lint-ignore CLANGTIDY facebook-hte-CArray
 const char packed_cache_path_option_key[] = "packed_cache_path";
 
+/// Key for the bool graph-runtime option (default off). When enabled, the new
+/// graph-based runtime replaces the legacy XNNCompiler/XNNExecutor path.
+const char use_graph_runtime_option_key[] = "use_graph_runtime";
+
 /// Workspace sharing mode. This is a backend option that can be set via the
 /// set_option API to control memory sharing between CALL_DELEGATE instances.
 /// This is useful for reducing memory consumption.
diff --git a/backends/xnnpack/runtime/XnnpackBackendOptions.cpp b/backends/xnnpack/runtime/XnnpackBackendOptions.cpp
@@ -43,6 +43,8 @@ Error XnnpackBackendOptions::get_option(BackendOption& option) const {
         std::min(packed_cache_path_.size(), runtime::kMaxOptionValueLength - 1);
     memcpy(arr.data(), packed_cache_path_.data(), len);
     option.value = arr;
+  } else if (strcmp(option.key, use_graph_runtime_option_key) == 0) {
+    option.value = use_graph_runtime_.load();
   }
   return Error::Ok;
 }
@@ -84,6 +86,14 @@ Error XnnpackBackendOptions::set_option(const BackendOption& option) {
         Debug,
         "Setting XNNPACK packed cache path to %s.",
         packed_cache_path_.c_str());
+  } else if (strcmp(option.key, use_graph_runtime_option_key) == 0) {
+    auto* val = std::get_if<bool>(&option.value);
+    if (!val) {
+      ET_LOG(Error, "XNNPACK use_graph_runtime must be a bool.");
+      return Error::InvalidArgument;
+    }
+    ET_LOG(Debug, "Setting XNNPACK use_graph_runtime to %d.", *val);
+    use_graph_runtime_.store(*val);
   }
   return Error::Ok;
 }
@@ -114,6 +124,12 @@ XnnpackBackendOptions::resolve_sharing_mode(
   return static_cast<WorkspaceSharingMode>(raw_mode);
 }
 
+bool XnnpackBackendOptions::resolve_graph_runtime(
+    const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const {
+  const bool fallback = use_graph_runtime_.load(); // backend-wide value
+  return resolve_option<bool>(context, use_graph_runtime_option_key, fallback);
+}
+
 WorkspaceSharingMode XnnpackBackendOptions::get_sharing_mode() const {
   return sharing_mode_.load();
 }
diff --git a/backends/xnnpack/runtime/XnnpackBackendOptions.h b/backends/xnnpack/runtime/XnnpackBackendOptions.h
@@ -37,6 +37,9 @@ class XnnpackBackendOptions {
   runtime::Result<WorkspaceSharingMode> resolve_sharing_mode(
       const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const;
 
+  bool resolve_graph_runtime(
+      const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const;
+
   WorkspaceSharingMode get_sharing_mode() const;
   XNNWorkspaceManager& workspace_manager();
   const XNNWorkspaceManager& workspace_manager() const;
@@ -61,6 +64,7 @@ class XnnpackBackendOptions {
 #endif
 
   std::string packed_cache_path_;
+  std::atomic<bool> use_graph_runtime_{false};
 };
 
 } // namespace executorch::backends::xnnpack
diff --git a/backends/xnnpack/test/CMakeLists.txt b/backends/xnnpack/test/CMakeLists.txt
@@ -69,3 +69,22 @@ target_include_directories(
           ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
           ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
 )
+
+# Graph runtime end-to-end tests; needs the XNNPACK runtime libraries linked.
+et_cxx_test(
+  backends_xnnpack_graph_e2e_test
+  SOURCES
+  runtime/test_e2e.cpp
+  EXTRA_LIBS
+  xnnpack_backend
+  XNNPACK
+  pthreadpool
+  cpuinfo
+  xnnpack-microkernels-prod
+)
+target_include_directories(
+  backends_xnnpack_graph_e2e_test
+  PRIVATE ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/XNNPACK/include
+          ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
+          ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
+)
diff --git a/backends/xnnpack/test/runtime/test_e2e.cpp b/backends/xnnpack/test/runtime/test_e2e.cpp

Original file line number	Diff line number	Diff line change
`@@ -43,6 +43,8 @@ Error XnnpackBackendOptions::get_option(BackendOption& option) const {`
`43`	`43`	`std::min(packed_cache_path_.size(), runtime::kMaxOptionValueLength - 1);`
`44`	`44`	`memcpy(arr.data(), packed_cache_path_.data(), len);`
`45`	`45`	`option.value = arr;`
	`46`	`+ } else if (strcmp(option.key, use_graph_runtime_option_key) == 0) {`
	`47`	`+ option.value = use_graph_runtime_.load();`
`46`	`48`	`}`
`47`	`49`	`return Error::Ok;`
`48`	`50`	`}`
`@@ -84,6 +86,14 @@ Error XnnpackBackendOptions::set_option(const BackendOption& option) {`
`84`	`86`	`Debug,`
`85`	`87`	`"Setting XNNPACK packed cache path to %s.",`
`86`	`88`	`packed_cache_path_.c_str());`
	`89`	`+ } else if (strcmp(option.key, use_graph_runtime_option_key) == 0) {`
	`90`	`+ auto* val = std::get_if<bool>(&option.value);`
	`91`	`+ if (!val) {`
	`92`	`+ ET_LOG(Error, "XNNPACK use_graph_runtime must be a bool.");`
	`93`	`+ return Error::InvalidArgument;`
	`94`	`+ }`
	`95`	`+ ET_LOG(Debug, "Setting XNNPACK use_graph_runtime to %d.", *val);`
	`96`	`+ use_graph_runtime_.store(*val);`
`87`	`97`	`}`
`88`	`98`	`return Error::Ok;`
`89`	`99`	`}`
`@@ -114,6 +124,12 @@ XnnpackBackendOptions::resolve_sharing_mode(`
`114`	`124`	`return static_cast<WorkspaceSharingMode>(raw_mode);`
`115`	`125`	`}`
`116`	`126`
	`127`	`+bool XnnpackBackendOptions::resolve_graph_runtime(`
	`128`	`+ const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const {`
	`129`	`+ return resolve_option<bool>(`
	`130`	`+ context, use_graph_runtime_option_key, use_graph_runtime_.load());`
	`131`	`+}`
	`132`	`+`
`117`	`133`	`WorkspaceSharingMode XnnpackBackendOptions::get_sharing_mode() const {`
`118`	`134`	`return sharing_mode_.load();`
`119`	`135`	`}`