66 * LICENSE file in the root directory of this source tree.
77 */
88
9+ #include < executorch/backends/xnnpack/runtime/FlatbufferGraphBuilder.h>
910#include < executorch/backends/xnnpack/runtime/XNNCompiler.h>
1011#include < executorch/backends/xnnpack/runtime/XNNPACKBackend.h>
1112#include < executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
1213#include < executorch/backends/xnnpack/runtime/XNNWorkspace.h>
1314#include < executorch/backends/xnnpack/runtime/XnnpackBackendOptions.h>
15+ #include < executorch/backends/xnnpack/runtime/executor/executor.h>
1416#include < executorch/runtime/backend/interface.h>
1517#include < executorch/runtime/core/error.h>
1618#include < executorch/runtime/core/evalue.h>
19+ #include < executorch/runtime/core/exec_aten/util/tensor_util.h>
1720#include < executorch/runtime/executor/pte_data_map.h>
21+ #include < executorch/runtime/platform/log.h>
22+ #include < chrono>
1823
24+ #include < cstring>
1925#include < memory>
2026#include < mutex>
27+ #include < vector>
2128
2229#pragma clang diagnostic ignored "-Wglobal-constructors"
2330
@@ -41,6 +48,16 @@ using executorch::runtime::FreeableBuffer;
4148using executorch::runtime::Result;
4249using executorch::runtime::Span;
4350
51+ struct XnnpackDelegateHandle {
52+ bool is_graph_runtime = false ;
53+ // Legacy path: XNNExecutor placed via runtime allocator.
54+ xnnpack::delegate::XNNExecutor* legacy_executor = nullptr ;
55+ // Graph path: heap-allocated Executor.
56+ xnnpack::executor::Executor* graph_executor = nullptr ;
57+ std::vector<uint32_t > input_external_ids;
58+ std::vector<uint32_t > output_external_ids;
59+ };
60+
4461class XnnpackBackend final
4562 : public ::executorch::ET_RUNTIME_NAMESPACE ::BackendInterface {
4663 public:
@@ -66,26 +83,67 @@ class XnnpackBackend final
6683 BackendInitContext& context,
6784 FreeableBuffer* processed,
6885 ArrayRef<CompileSpec> compile_specs) const override {
86+ auto * handle = context.get_runtime_allocator ()
87+ ->allocateInstance <XnnpackDelegateHandle>();
88+ if (handle == nullptr ) {
89+ return Error::MemoryAllocationFailed;
90+ }
91+ new (handle) XnnpackDelegateHandle ();
92+
93+ bool use_graph_runtime = options_.resolve_graph_runtime (context);
94+ handle->is_graph_runtime = use_graph_runtime;
95+
96+ if (use_graph_runtime) {
97+ auto t0 = std::chrono::steady_clock::now ();
98+ const NamedDataMap* named_data_map = context.get_named_data_map ();
99+ ET_UNWRAP (
100+ result,
101+ xnnpack::FlatbufferGraphBuilder::build (
102+ processed->data (), processed->size (), named_data_map));
103+ processed->Free ();
104+ auto t1 = std::chrono::steady_clock::now ();
105+
106+ ET_UNWRAP (
107+ built_executor, xnnpack::executor::Executor::build (result.graph ));
108+ auto * executor =
109+ new xnnpack::executor::Executor (std::move (built_executor));
110+ auto t2 = std::chrono::steady_clock::now ();
111+ handle->graph_executor = executor;
112+ handle->input_external_ids = std::move (result.input_external_ids );
113+ handle->output_external_ids = std::move (result.output_external_ids );
114+ ET_LOG (
115+ Info,
116+ " Graph runtime init: deserialize=%lldms executor_build=%lldms" ,
117+ (long long )std::chrono::duration_cast<std::chrono::milliseconds>(
118+ t1 - t0)
119+ .count (),
120+ (long long )std::chrono::duration_cast<std::chrono::milliseconds>(
121+ t2 - t1)
122+ .count ());
123+ return handle;
124+ }
125+
69126 auto executor = context.get_runtime_allocator ()
70127 ->allocateInstance <xnnpack::delegate::XNNExecutor>();
71128 if (executor == nullptr ) {
129+ handle->~XnnpackDelegateHandle ();
72130 return Error::MemoryAllocationFailed;
73131 }
74132
75133 const NamedDataMap* named_data_map = context.get_named_data_map ();
76- // thread safe. This can happen when multiple threads call init() on
77- // the same backend instance.
78134
79135 auto program_id =
80136 reinterpret_cast <uintptr_t >(context.get_runtime_allocator ());
81137 auto sharing_mode_result = options_.resolve_sharing_mode (context);
82138 if (!sharing_mode_result.ok ()) {
139+ handle->~XnnpackDelegateHandle ();
83140 return sharing_mode_result.error ();
84141 }
85142 auto workspace_result =
86143 options_.workspace_manager ().get_or_create_workspace (
87144 program_id, sharing_mode_result.get ());
88145 if (!workspace_result.ok ()) {
146+ handle->~XnnpackDelegateHandle ();
89147 return workspace_result.error ();
90148 }
91149 auto workspace = workspace_result.get ();
@@ -128,23 +186,27 @@ class XnnpackBackend final
128186 processed->Free ();
129187
130188 if (err != Error::Ok) {
131- // destroy() won't be called on this handle, so we need to clean it up
132- // now.
133189 executor->~XNNExecutor ();
134-
190+ handle-> ~XnnpackDelegateHandle ();
135191 ET_LOG (
136192 Error, " XNNCompiler::compileModel failed: 0x%x" , (unsigned int )err);
137193 return err;
138194 }
139-
140- return executor ;
195+ handle-> legacy_executor = executor;
196+ return handle ;
141197 }
142198
143199 Error execute (
144200 BackendExecutionContext& context,
145201 DelegateHandle* handle,
146202 Span<EValue*> args) const override {
147- auto executor = static_cast <xnnpack::delegate::XNNExecutor*>(handle);
203+ auto * delegate = static_cast <XnnpackDelegateHandle*>(handle);
204+
205+ if (delegate->is_graph_runtime ) {
206+ return execute_graph (delegate, args);
207+ }
208+
209+ auto executor = delegate->legacy_executor ;
148210
149211 auto workspace = executor->get_workspace ();
150212
@@ -176,7 +238,15 @@ class XnnpackBackend final
176238
177239 void destroy (DelegateHandle* handle) const override {
178240 if (handle != nullptr ) {
179- auto executor = static_cast <xnnpack::delegate::XNNExecutor*>(handle);
241+ auto * delegate = static_cast <XnnpackDelegateHandle*>(handle);
242+
243+ if (delegate->is_graph_runtime ) {
244+ delete delegate->graph_executor ;
245+ delegate->~XnnpackDelegateHandle ();
246+ return ;
247+ }
248+
249+ auto executor = delegate->legacy_executor ;
180250 auto workspace = executor->get_workspace ();
181251
182252 const std::lock_guard<std::mutex> lock_weights_cache (
@@ -200,6 +270,7 @@ class XnnpackBackend final
200270 // XNNExecutor is not trivially destructible. Since this was constructed
201271 // manually in init(), we must destroy it manually here.
202272 executor->~XNNExecutor ();
273+ delegate->~XnnpackDelegateHandle ();
203274 }
204275 }
205276
@@ -228,6 +299,107 @@ class XnnpackBackend final
228299 }
229300
230301 private:
302+ Error execute_graph (XnnpackDelegateHandle* delegate, Span<EValue*> args)
303+ const {
304+ auto * executor = delegate->graph_executor ;
305+
306+ // Build input tensors from EValue args.
307+ std::vector<xnnpack::core::Tensor> inputs;
308+ inputs.reserve (delegate->input_external_ids .size ());
309+ for (uint32_t ext_id : delegate->input_external_ids ) {
310+ ET_CHECK_OR_RETURN_ERROR (
311+ ext_id < args.size (),
312+ InvalidProgram,
313+ " Input external id %u out of range (%zu args)" ,
314+ ext_id,
315+ args.size ());
316+ auto & et_tensor = args[ext_id]->toTensor ();
317+ xnnpack::core::Tensor t;
318+ // The external-value dtype is taken from the serialized graph spec; this
319+ // field is informational for the input wrapper. Defaulting to Float32
320+ // matches the supported (float) input set.
321+ t.dtype = xnnpack::core::DType::Float32;
322+ if (et_tensor.dim () == 0 ) {
323+ t.sizes = {1 };
324+ } else {
325+ // Pass dims in physical (dim-order-permuted) layout so a channels-last
326+ // input matches the NHWC layout XNNPACK expects, mirroring the legacy
327+ // XNNExecutor path.
328+ size_t num_dims = et_tensor.dim ();
329+ executorch::aten::DimOrderType
330+ dim_order[::executorch::runtime::kTensorDimensionLimit ];
331+ ET_CHECK_OK_OR_RETURN_ERROR (ET_RUNTIME_NAMESPACE::get_dim_order (
332+ et_tensor, dim_order, num_dims));
333+ t.sizes .resize (num_dims);
334+ for (size_t d = 0 ; d < num_dims; d++) {
335+ t.sizes [d] = static_cast <uint64_t >(
336+ et_tensor.size (static_cast <int >(dim_order[d])));
337+ }
338+ }
339+ t.storage .data = et_tensor.mutable_data_ptr ();
340+ t.storage .size_in_bytes = et_tensor.nbytes ();
341+ t.storage .owner = xnnpack::core::StorageOwner::External;
342+ inputs.push_back (std::move (t));
343+ }
344+
345+ ET_UNWRAP (outputs, executor->run ({inputs.data (), inputs.size ()}));
346+
347+ ET_CHECK_OR_RETURN_ERROR (
348+ outputs.size () == delegate->output_external_ids .size (),
349+ Internal,
350+ " Executor produced %zu outputs, expected %zu" ,
351+ outputs.size (),
352+ delegate->output_external_ids .size ());
353+
354+ // Copy output data back to EValue tensors.
355+ for (size_t i = 0 ; i < delegate->output_external_ids .size (); i++) {
356+ uint32_t ext_id = delegate->output_external_ids [i];
357+ ET_CHECK_OR_RETURN_ERROR (
358+ ext_id < args.size (),
359+ InvalidProgram,
360+ " Output external id %u out of range (%zu args)" ,
361+ ext_id,
362+ args.size ());
363+ auto & et_tensor = args[ext_id]->toTensor ();
364+ auto & out_tensor = outputs[i];
365+
366+ // Resize the output EValue tensor to match the computed shape. The
367+ // executor reports dims in XNNPACK physical (channels-last) order;
368+ // scatter them back to the tensor's logical order via its dim_order,
369+ // mirroring the legacy XNNExecutor::resize_outputs path.
370+ size_t num_dims = out_tensor.sizes .size ();
371+ std::vector<executorch::aten::SizesType> new_sizes_vec (num_dims);
372+ executorch::aten::DimOrderType
373+ out_dim_order[::executorch::runtime::kTensorDimensionLimit ];
374+ ET_CHECK_OK_OR_RETURN_ERROR (ET_RUNTIME_NAMESPACE::get_dim_order (
375+ et_tensor, out_dim_order, num_dims));
376+ for (size_t d = 0 ; d < num_dims; d++) {
377+ new_sizes_vec[out_dim_order[d]] =
378+ static_cast <executorch::aten::SizesType>(out_tensor.sizes [d]);
379+ }
380+ executorch::aten::ArrayRef<executorch::aten::SizesType> new_sizes (
381+ new_sizes_vec.data (), new_sizes_vec.size ());
382+ ET_CHECK_OK_OR_RETURN_ERROR (
383+ executorch::runtime::resize_tensor (et_tensor, new_sizes));
384+
385+ if (out_tensor.storage .data != et_tensor.mutable_data_ptr ()) {
386+ ET_CHECK_OR_RETURN_ERROR (
387+ out_tensor.storage .size_in_bytes <= et_tensor.nbytes (),
388+ Internal,
389+ " Output %zu is %zu bytes, exceeds tensor capacity %zu" ,
390+ i,
391+ out_tensor.storage .size_in_bytes ,
392+ et_tensor.nbytes ());
393+ std::memcpy (
394+ et_tensor.mutable_data_ptr (),
395+ out_tensor.storage .data ,
396+ out_tensor.storage .size_in_bytes );
397+ }
398+ }
399+
400+ return Error::Ok;
401+ }
402+
231403 mutable xnnpack::XnnpackBackendOptions options_;
232404
233405 // Weights cache is global to all delegate instances.
0 commit comments