Skip to content

Commit bfcfd5d

Browse files
committed
XNNPACK graph runtime: backend integration behind use_graph_runtime
Wires the graph runtime into XnnpackBackend behind the use_graph_runtime backend option (default off, with per-method runtime-spec override). When enabled, init() deserializes via FlatbufferGraphBuilder and builds an Executor; execute()/destroy() gain the graph path alongside the legacy XNNCompiler path. With prefer_in_tree_kernel disabled, existing all-XNNPACK models run end-to-end on the new executor. Authored with Claude. ghstack-source-id: 06e4862 ghstack-comment-id: 4695613250 Pull-Request: #20256
1 parent 389dcb4 commit bfcfd5d

6 files changed

Lines changed: 1199 additions & 9 deletions

File tree

backends/xnnpack/runtime/XNNPACKBackend.cpp

Lines changed: 181 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,25 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <executorch/backends/xnnpack/runtime/FlatbufferGraphBuilder.h>
910
#include <executorch/backends/xnnpack/runtime/XNNCompiler.h>
1011
#include <executorch/backends/xnnpack/runtime/XNNPACKBackend.h>
1112
#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
1213
#include <executorch/backends/xnnpack/runtime/XNNWorkspace.h>
1314
#include <executorch/backends/xnnpack/runtime/XnnpackBackendOptions.h>
15+
#include <executorch/backends/xnnpack/runtime/executor/executor.h>
1416
#include <executorch/runtime/backend/interface.h>
1517
#include <executorch/runtime/core/error.h>
1618
#include <executorch/runtime/core/evalue.h>
19+
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
1720
#include <executorch/runtime/executor/pte_data_map.h>
21+
#include <executorch/runtime/platform/log.h>
22+
#include <chrono>
1823

24+
#include <cstring>
1925
#include <memory>
2026
#include <mutex>
27+
#include <vector>
2128

2229
#pragma clang diagnostic ignored "-Wglobal-constructors"
2330

@@ -41,6 +48,16 @@ using executorch::runtime::FreeableBuffer;
4148
using executorch::runtime::Result;
4249
using executorch::runtime::Span;
4350

51+
struct XnnpackDelegateHandle {
52+
bool is_graph_runtime = false;
53+
// Legacy path: XNNExecutor placed via runtime allocator.
54+
xnnpack::delegate::XNNExecutor* legacy_executor = nullptr;
55+
// Graph path: heap-allocated Executor.
56+
xnnpack::executor::Executor* graph_executor = nullptr;
57+
std::vector<uint32_t> input_external_ids;
58+
std::vector<uint32_t> output_external_ids;
59+
};
60+
4461
class XnnpackBackend final
4562
: public ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface {
4663
public:
@@ -66,26 +83,67 @@ class XnnpackBackend final
6683
BackendInitContext& context,
6784
FreeableBuffer* processed,
6885
ArrayRef<CompileSpec> compile_specs) const override {
86+
auto* handle = context.get_runtime_allocator()
87+
->allocateInstance<XnnpackDelegateHandle>();
88+
if (handle == nullptr) {
89+
return Error::MemoryAllocationFailed;
90+
}
91+
new (handle) XnnpackDelegateHandle();
92+
93+
bool use_graph_runtime = options_.resolve_graph_runtime(context);
94+
handle->is_graph_runtime = use_graph_runtime;
95+
96+
if (use_graph_runtime) {
97+
auto t0 = std::chrono::steady_clock::now();
98+
const NamedDataMap* named_data_map = context.get_named_data_map();
99+
ET_UNWRAP(
100+
result,
101+
xnnpack::FlatbufferGraphBuilder::build(
102+
processed->data(), processed->size(), named_data_map));
103+
processed->Free();
104+
auto t1 = std::chrono::steady_clock::now();
105+
106+
ET_UNWRAP(
107+
built_executor, xnnpack::executor::Executor::build(result.graph));
108+
auto* executor =
109+
new xnnpack::executor::Executor(std::move(built_executor));
110+
auto t2 = std::chrono::steady_clock::now();
111+
handle->graph_executor = executor;
112+
handle->input_external_ids = std::move(result.input_external_ids);
113+
handle->output_external_ids = std::move(result.output_external_ids);
114+
ET_LOG(
115+
Info,
116+
"Graph runtime init: deserialize=%lldms executor_build=%lldms",
117+
(long long)std::chrono::duration_cast<std::chrono::milliseconds>(
118+
t1 - t0)
119+
.count(),
120+
(long long)std::chrono::duration_cast<std::chrono::milliseconds>(
121+
t2 - t1)
122+
.count());
123+
return handle;
124+
}
125+
69126
auto executor = context.get_runtime_allocator()
70127
->allocateInstance<xnnpack::delegate::XNNExecutor>();
71128
if (executor == nullptr) {
129+
handle->~XnnpackDelegateHandle();
72130
return Error::MemoryAllocationFailed;
73131
}
74132

75133
const NamedDataMap* named_data_map = context.get_named_data_map();
76-
// thread safe. This can happen when multiple threads call init() on
77-
// the same backend instance.
78134

79135
auto program_id =
80136
reinterpret_cast<uintptr_t>(context.get_runtime_allocator());
81137
auto sharing_mode_result = options_.resolve_sharing_mode(context);
82138
if (!sharing_mode_result.ok()) {
139+
handle->~XnnpackDelegateHandle();
83140
return sharing_mode_result.error();
84141
}
85142
auto workspace_result =
86143
options_.workspace_manager().get_or_create_workspace(
87144
program_id, sharing_mode_result.get());
88145
if (!workspace_result.ok()) {
146+
handle->~XnnpackDelegateHandle();
89147
return workspace_result.error();
90148
}
91149
auto workspace = workspace_result.get();
@@ -128,23 +186,27 @@ class XnnpackBackend final
128186
processed->Free();
129187

130188
if (err != Error::Ok) {
131-
// destroy() won't be called on this handle, so we need to clean it up
132-
// now.
133189
executor->~XNNExecutor();
134-
190+
handle->~XnnpackDelegateHandle();
135191
ET_LOG(
136192
Error, "XNNCompiler::compileModel failed: 0x%x", (unsigned int)err);
137193
return err;
138194
}
139-
140-
return executor;
195+
handle->legacy_executor = executor;
196+
return handle;
141197
}
142198

143199
Error execute(
144200
BackendExecutionContext& context,
145201
DelegateHandle* handle,
146202
Span<EValue*> args) const override {
147-
auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
203+
auto* delegate = static_cast<XnnpackDelegateHandle*>(handle);
204+
205+
if (delegate->is_graph_runtime) {
206+
return execute_graph(delegate, args);
207+
}
208+
209+
auto executor = delegate->legacy_executor;
148210

149211
auto workspace = executor->get_workspace();
150212

@@ -176,7 +238,15 @@ class XnnpackBackend final
176238

177239
void destroy(DelegateHandle* handle) const override {
178240
if (handle != nullptr) {
179-
auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
241+
auto* delegate = static_cast<XnnpackDelegateHandle*>(handle);
242+
243+
if (delegate->is_graph_runtime) {
244+
delete delegate->graph_executor;
245+
delegate->~XnnpackDelegateHandle();
246+
return;
247+
}
248+
249+
auto executor = delegate->legacy_executor;
180250
auto workspace = executor->get_workspace();
181251

182252
const std::lock_guard<std::mutex> lock_weights_cache(
@@ -200,6 +270,7 @@ class XnnpackBackend final
200270
// XNNExecutor is not trivially destructible. Since this was constructed
201271
// manually in init(), we must destroy it manually here.
202272
executor->~XNNExecutor();
273+
delegate->~XnnpackDelegateHandle();
203274
}
204275
}
205276

@@ -228,6 +299,107 @@ class XnnpackBackend final
228299
}
229300

230301
private:
302+
/**
 * Runs one inference on the graph-runtime executor.
 *
 * Each input EValue tensor (located in `args` by its external id) is wrapped
 * as a non-owning xnnpack::core::Tensor and handed to Executor::run(). Each
 * output EValue tensor is then resized to the computed shape and, when the
 * executor did not already write into its buffer, filled by copying the
 * result bytes.
 *
 * @param delegate Handle carrying the graph executor and the input/output
 *     external-id lists.
 * @param args Delegate arguments; both inputs and outputs are indexed by
 *     external id.
 * @return Error::Ok on success. InvalidProgram when an external id is out of
 *     range, InvalidArgument when an argument is not a tensor, Internal when
 *     the executor's outputs disagree with the destination tensors (count,
 *     rank, or byte size), or any error propagated from the executor or the
 *     tensor utilities.
 */
Error execute_graph(XnnpackDelegateHandle* delegate, Span<EValue*> args)
    const {
  auto* executor = delegate->graph_executor;

  // Build input tensors from EValue args.
  std::vector<xnnpack::core::Tensor> inputs;
  inputs.reserve(delegate->input_external_ids.size());
  for (uint32_t ext_id : delegate->input_external_ids) {
    ET_CHECK_OR_RETURN_ERROR(
        ext_id < args.size(),
        InvalidProgram,
        "Input external id %u out of range (%zu args)",
        ext_id,
        args.size());
    // toTensor() hard-aborts on a non-tensor EValue; report an error instead.
    ET_CHECK_OR_RETURN_ERROR(
        args[ext_id] != nullptr && args[ext_id]->isTensor(),
        InvalidArgument,
        "Input external id %u is not a tensor",
        ext_id);
    auto& et_tensor = args[ext_id]->toTensor();
    xnnpack::core::Tensor t;
    // The external-value dtype is taken from the serialized graph spec; this
    // field is informational for the input wrapper. Defaulting to Float32
    // matches the supported (float) input set.
    t.dtype = xnnpack::core::DType::Float32;
    if (et_tensor.dim() == 0) {
      // Scalars are presented to the executor as a one-element 1-D tensor.
      t.sizes = {1};
    } else {
      // Pass dims in physical (dim-order-permuted) layout so a channels-last
      // input matches the NHWC layout XNNPACK expects, mirroring the legacy
      // XNNExecutor path.
      size_t num_dims = et_tensor.dim();
      executorch::aten::DimOrderType
          dim_order[::executorch::runtime::kTensorDimensionLimit];
      ET_CHECK_OK_OR_RETURN_ERROR(ET_RUNTIME_NAMESPACE::get_dim_order(
          et_tensor, dim_order, num_dims));
      t.sizes.resize(num_dims);
      for (size_t d = 0; d < num_dims; d++) {
        t.sizes[d] = static_cast<uint64_t>(
            et_tensor.size(static_cast<int>(dim_order[d])));
      }
    }
    t.storage.data = et_tensor.mutable_data_ptr();
    t.storage.size_in_bytes = et_tensor.nbytes();
    t.storage.owner = xnnpack::core::StorageOwner::External;
    inputs.push_back(std::move(t));
  }

  ET_UNWRAP(outputs, executor->run({inputs.data(), inputs.size()}));

  ET_CHECK_OR_RETURN_ERROR(
      outputs.size() == delegate->output_external_ids.size(),
      Internal,
      "Executor produced %zu outputs, expected %zu",
      outputs.size(),
      delegate->output_external_ids.size());

  // Copy output data back to EValue tensors.
  for (size_t i = 0; i < delegate->output_external_ids.size(); i++) {
    uint32_t ext_id = delegate->output_external_ids[i];
    ET_CHECK_OR_RETURN_ERROR(
        ext_id < args.size(),
        InvalidProgram,
        "Output external id %u out of range (%zu args)",
        ext_id,
        args.size());
    ET_CHECK_OR_RETURN_ERROR(
        args[ext_id] != nullptr && args[ext_id]->isTensor(),
        InvalidArgument,
        "Output external id %u is not a tensor",
        ext_id);
    auto& et_tensor = args[ext_id]->toTensor();
    auto& out_tensor = outputs[i];

    const size_t num_dims = out_tensor.sizes.size();
    const size_t et_rank = static_cast<size_t>(et_tensor.dim());

    if (et_rank == 0) {
      // A rank-0 destination has no dim order to scatter through. Accept the
      // executor reporting the scalar either as rank 0 or as {1} (the shape
      // scalar inputs are given above); there is nothing to resize.
      ET_CHECK_OR_RETURN_ERROR(
          num_dims == 0 || (num_dims == 1 && out_tensor.sizes[0] == 1),
          Internal,
          "Output %zu has rank %zu, expected a scalar",
          i,
          num_dims);
    } else {
      // get_dim_order() only fills et_rank entries, so the ranks must agree
      // before out_dim_order is indexed by the executor's rank.
      ET_CHECK_OR_RETURN_ERROR(
          num_dims == et_rank,
          Internal,
          "Output %zu has rank %zu, expected %zu",
          i,
          num_dims,
          et_rank);

      // Resize the output EValue tensor to match the computed shape. The
      // executor reports dims in XNNPACK physical (channels-last) order;
      // scatter them back to the tensor's logical order via its dim_order,
      // mirroring the legacy XNNExecutor::resize_outputs path.
      std::vector<executorch::aten::SizesType> new_sizes_vec(num_dims);
      executorch::aten::DimOrderType
          out_dim_order[::executorch::runtime::kTensorDimensionLimit];
      ET_CHECK_OK_OR_RETURN_ERROR(ET_RUNTIME_NAMESPACE::get_dim_order(
          et_tensor, out_dim_order, num_dims));
      for (size_t d = 0; d < num_dims; d++) {
        const size_t logical_dim = out_dim_order[d];
        ET_CHECK_OR_RETURN_ERROR(
            logical_dim < num_dims,
            Internal,
            "Output %zu has invalid dim order entry %zu",
            i,
            logical_dim);
        new_sizes_vec[logical_dim] =
            static_cast<executorch::aten::SizesType>(out_tensor.sizes[d]);
      }
      executorch::aten::ArrayRef<executorch::aten::SizesType> new_sizes(
          new_sizes_vec.data(), new_sizes_vec.size());
      ET_CHECK_OK_OR_RETURN_ERROR(
          executorch::runtime::resize_tensor(et_tensor, new_sizes));
    }

    // Skip the copy when the executor already wrote into the tensor's buffer.
    if (out_tensor.storage.data != et_tensor.mutable_data_ptr()) {
      ET_CHECK_OR_RETURN_ERROR(
          out_tensor.storage.size_in_bytes <= et_tensor.nbytes(),
          Internal,
          "Output %zu is %zu bytes, exceeds tensor capacity %zu",
          i,
          out_tensor.storage.size_in_bytes,
          et_tensor.nbytes());
      // memcpy with a null pointer is undefined even for a zero length.
      if (out_tensor.storage.size_in_bytes > 0) {
        std::memcpy(
            et_tensor.mutable_data_ptr(),
            out_tensor.storage.data,
            out_tensor.storage.size_in_bytes);
      }
    }
  }

  return Error::Ok;
}
402+
231403
mutable xnnpack::XnnpackBackendOptions options_;
232404

233405
// Weights cache is global to all delegate instances.

backends/xnnpack/runtime/XNNPACKBackend.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ const char weight_cache_option_key[] = "weight_cache_enabled";
2020
// @lint-ignore CLANGTIDY facebook-hte-CArray
2121
const char packed_cache_path_option_key[] = "packed_cache_path";
2222

23+
/// The key for the graph runtime option. When enabled, the new graph-based
24+
/// runtime is used instead of the legacy XNNCompiler/XNNExecutor path.
25+
const char use_graph_runtime_option_key[] = "use_graph_runtime";
26+
2327
/// Workspace sharing mode. This is a backend option that can be set via the
2428
/// set_option API to control memory sharing between CALL_DELEGATE instances.
2529
/// This is useful for reducing memory consumption.

backends/xnnpack/runtime/XnnpackBackendOptions.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ Error XnnpackBackendOptions::get_option(BackendOption& option) const {
4343
std::min(packed_cache_path_.size(), runtime::kMaxOptionValueLength - 1);
4444
memcpy(arr.data(), packed_cache_path_.data(), len);
4545
option.value = arr;
46+
} else if (strcmp(option.key, use_graph_runtime_option_key) == 0) {
47+
option.value = use_graph_runtime_.load();
4648
}
4749
return Error::Ok;
4850
}
@@ -84,6 +86,14 @@ Error XnnpackBackendOptions::set_option(const BackendOption& option) {
8486
Debug,
8587
"Setting XNNPACK packed cache path to %s.",
8688
packed_cache_path_.c_str());
89+
} else if (strcmp(option.key, use_graph_runtime_option_key) == 0) {
90+
auto* val = std::get_if<bool>(&option.value);
91+
if (!val) {
92+
ET_LOG(Error, "XNNPACK use_graph_runtime must be a bool.");
93+
return Error::InvalidArgument;
94+
}
95+
ET_LOG(Debug, "Setting XNNPACK use_graph_runtime to %d.", *val);
96+
use_graph_runtime_.store(*val);
8797
}
8898
return Error::Ok;
8999
}
@@ -114,6 +124,12 @@ XnnpackBackendOptions::resolve_sharing_mode(
114124
return static_cast<WorkspaceSharingMode>(raw_mode);
115125
}
116126

127+
bool XnnpackBackendOptions::resolve_graph_runtime(
128+
const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const {
129+
return resolve_option<bool>(
130+
context, use_graph_runtime_option_key, use_graph_runtime_.load());
131+
}
132+
117133
WorkspaceSharingMode XnnpackBackendOptions::get_sharing_mode() const {
118134
return sharing_mode_.load();
119135
}

backends/xnnpack/runtime/XnnpackBackendOptions.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ class XnnpackBackendOptions {
3737
runtime::Result<WorkspaceSharingMode> resolve_sharing_mode(
3838
const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const;
3939

40+
bool resolve_graph_runtime(
41+
const ET_RUNTIME_NAMESPACE::BackendInitContext& context) const;
42+
4043
WorkspaceSharingMode get_sharing_mode() const;
4144
XNNWorkspaceManager& workspace_manager();
4245
const XNNWorkspaceManager& workspace_manager() const;
@@ -61,6 +64,7 @@ class XnnpackBackendOptions {
6164
#endif
6265

6366
std::string packed_cache_path_;
67+
std::atomic<bool> use_graph_runtime_{false};
6468
};
6569

6670
} // namespace executorch::backends::xnnpack

backends/xnnpack/test/CMakeLists.txt

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,3 +69,22 @@ target_include_directories(
6969
${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
7070
${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
7171
)
72+
73+
# End-to-end tests for the graph runtime (requires XNNPACK runtime).
et_cxx_test(
  backends_xnnpack_graph_e2e_test SOURCES runtime/test_e2e.cpp EXTRA_LIBS
  xnnpack_backend XNNPACK pthreadpool cpuinfo xnnpack-microkernels-prod
)

# The test includes headers from the vendored XNNPACK, cpuinfo and
# pthreadpool trees.
target_include_directories(
  backends_xnnpack_graph_e2e_test
  PRIVATE
    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/XNNPACK/include
    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
)

0 commit comments

Comments
 (0)