Skip to content

Commit 629a274

Browse files
committed
Update
[ghstack-poisoned]
1 parent e39b785 commit 629a274

3 files changed

Lines changed: 380 additions & 0 deletions

File tree

backends/xnnpack/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ list(
117117
backends/xnnpack/runtime/plan/schedule.cpp
118118
backends/xnnpack/runtime/plan/execution_plan.cpp
119119
backends/xnnpack/runtime/plan/memory_plan.cpp
120+
backends/xnnpack/runtime/executor/executor.cpp
120121
)
121122

122123
list(TRANSFORM _xnnpack_backend__srcs PREPEND "${EXECUTORCH_ROOT}/")
Lines changed: 339 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,339 @@
1+
#include <executorch/backends/xnnpack/runtime/executor/executor.h>
2+
3+
#include <executorch/backends/xnnpack/runtime/core/quant_params.h>
4+
#include <executorch/backends/xnnpack/runtime/core/variant_util.h>
5+
#include <executorch/backends/xnnpack/runtime/plan/execution_plan.h>
6+
7+
#include <executorch/runtime/platform/log.h>
8+
9+
#include <xnnpack.h>
10+
#include <cassert>
11+
#include <chrono>
12+
#include <cstdlib>
13+
14+
namespace executorch::backends::xnnpack::executor {
15+
16+
using executorch::runtime::Span;
17+
18+
// Runs one inference pass.
//
// Re-plans memory for `inputs`, executes every step of `plan` in order, and
// returns one tensor per entry in `output_slots`. Each returned tensor copies
// the dtype and sizes of the corresponding value but only aliases its data
// pointer (it is tagged StorageOwner::External); no tensor data is copied.
//
// Returns the first error reported by update_planned_memory() or run_step().
runtime::Result<std::vector<core::Tensor>> Executor::run(
    Span<core::Tensor> inputs) {
  using Clock = std::chrono::steady_clock;
  using Micros = std::chrono::microseconds;

  const auto plan_begin = Clock::now();
  ET_CHECK_OK_OR_RETURN_ERROR(update_planned_memory(inputs));
  const auto plan_elapsed =
      std::chrono::duration_cast<Micros>(Clock::now() - plan_begin);
  ET_LOG(
      Info,
      "update_planned_memory %lldus",
      static_cast<long long>(plan_elapsed.count()));

  // Steps are executed strictly in plan order; the index is only used for
  // logging inside run_step().
  size_t step_index = 0;
  for (const auto& step : plan.steps) {
    ET_CHECK_OK_OR_RETURN_ERROR(run_step(step_index, step));
    ++step_index;
  }

  std::vector<core::Tensor> results;
  results.reserve(output_slots.size());
  for (const auto slot : output_slots) {
    const core::Tensor& source = values[slot];
    core::Tensor& view = results.emplace_back();
    view.dtype = source.dtype;
    view.sizes = source.sizes;
    view.storage.data = source.storage.data;
    view.storage.size_in_bytes = source.storage.size_in_bytes;
    view.storage.owner = core::StorageOwner::External;
  }
  return results;
}
48+
49+
// Executes a single plan step, dispatching on the alternative held by `step`.
//
// - RunOperatorStep: gathers pointers to the step's input/output values and
//   calls the operator's execute(), logging the elapsed time.
// - RunXnnSubgraphStep: runs setup_xnn_step(), xnn_invoke_runtime(), then
//   update_xnn_output_shapes(); timings are logged only when all three succeed.
//
// `step_idx` is used purely for the operator-step log line.
// Returns Error::Ok on success, otherwise the error from the failing stage
// (Error::Internal when xnn_invoke_runtime fails).
runtime::Error Executor::run_step(size_t step_idx, const plan::PlanStep& step) {
  using Clock = std::chrono::steady_clock;

  // Elapsed microseconds between two time points, in the type the log
  // format strings expect.
  const auto micros_between = [](Clock::time_point from,
                                 Clock::time_point to) {
    return static_cast<long long>(
        std::chrono::duration_cast<std::chrono::microseconds>(to - from)
            .count());
  };

  runtime::Error result = runtime::Error::Ok;
  std::visit(
      overloaded{
          [&](const plan::RunOperatorStep& op_step) {
            std::vector<core::Tensor*> in_ptrs;
            in_ptrs.reserve(op_step.input_slots.size());
            for (const auto slot : op_step.input_slots) {
              in_ptrs.push_back(&values[slot]);
            }

            std::vector<core::Tensor*> out_ptrs;
            out_ptrs.reserve(op_step.output_slots.size());
            for (const auto slot : op_step.output_slots) {
              out_ptrs.push_back(&values[slot]);
            }

            const auto begin = Clock::now();
            op_step.op->execute(
                {in_ptrs.data(), in_ptrs.size()},
                {out_ptrs.data(), out_ptrs.size()});
            const auto end = Clock::now();
            ET_LOG(
                Info,
                "OpStep[%zu] %lldus",
                step_idx,
                micros_between(begin, end));
          },
          [&](const plan::RunXnnSubgraphStep& xnn_step) {
            const auto setup_begin = Clock::now();
            result = setup_xnn_step(xnn_step);
            if (result != runtime::Error::Ok) {
              return;
            }

            const auto invoke_begin = Clock::now();
            const auto status = xnn_invoke_runtime(xnn_step.runtime.get());
            if (status != xnn_status_success) {
              ET_LOG(
                  Error,
                  "xnn_invoke_runtime failed: 0x%x",
                  (unsigned int)status);
              result = runtime::Error::Internal;
              return;
            }

            const auto invoke_end = Clock::now();
            result = update_xnn_output_shapes(xnn_step);
            if (result != runtime::Error::Ok) {
              return;
            }

            ET_LOG(
                Info,
                "XnnStep setup=%lldus invoke=%lldus",
                micros_between(setup_begin, invoke_begin),
                micros_between(invoke_begin, invoke_end));
          }},
      step);
  return result;
}
112+
113+
// Prepares an XNNPACK runtime for invocation with the current value shapes
// and data pointers.
//
// Steps, in order:
//   1. Reshape each external input (external ids [0, num_external_inputs))
//      to the sizes of the value in its slot.
//   2. Call xnn_reshape_runtime() so XNNPACK can propagate the new shapes.
//   3. Bind every external value (inputs and outputs) to the data pointer
//      currently stored in `values` via xnn_setup_runtime_v2().
//
// External ids are taken to be the indices into `xnn.external_value_slots`.
//
// Returns Error::Internal if the step's input count is inconsistent with its
// external value list or if any XNNPACK call fails; Error::Ok otherwise.
runtime::Error Executor::setup_xnn_step(const plan::RunXnnSubgraphStep& xnn) {
  auto* rt = xnn.runtime.get();
  const size_t num_externals = xnn.external_value_slots.size();

  // Guard the indexing below: inputs are a prefix of the external value list.
  ET_CHECK_OR_RETURN_ERROR(
      xnn.num_external_inputs <= num_externals,
      Internal,
      "XNN step has %zu external inputs but only %zu external values",
      static_cast<size_t>(xnn.num_external_inputs),
      num_externals);

  // Reused across inputs so only the first (or a larger-rank) tensor
  // triggers an allocation.
  std::vector<size_t> dims;
  for (uint32_t eid = 0; eid < xnn.num_external_inputs; eid++) {
    const auto slot = xnn.external_value_slots[eid];
    const auto& tensor = values[slot];
    dims.assign(tensor.sizes.begin(), tensor.sizes.end());
    const auto status =
        xnn_reshape_external_value(rt, eid, dims.size(), dims.data());
    ET_CHECK_OR_RETURN_ERROR(
        status == xnn_status_success,
        Internal,
        "xnn_reshape_external_value failed: 0x%x",
        (unsigned int)status);
  }

  const auto reshape_status = xnn_reshape_runtime(rt);
  ET_CHECK_OR_RETURN_ERROR(
      reshape_status == xnn_status_success,
      Internal,
      "xnn_reshape_runtime failed: 0x%x",
      (unsigned int)reshape_status);

  std::vector<xnn_external_value> externals(num_externals);
  for (uint32_t eid = 0; eid < num_externals; eid++) {
    const auto slot = xnn.external_value_slots[eid];
    externals[eid].id = eid;
    externals[eid].data = values[slot].storage.data;
  }

  const auto setup_status =
      xnn_setup_runtime_v2(rt, externals.size(), externals.data());
  ET_CHECK_OR_RETURN_ERROR(
      setup_status == xnn_status_success,
      Internal,
      "xnn_setup_runtime_v2 failed: 0x%x",
      (unsigned int)setup_status);
  return runtime::Error::Ok;
}
146+
147+
// Reads back the shape XNNPACK reports for every external value of `xnn`
// and mirrors it into the corresponding entry of `values`.
//
// For each external id (index into `xnn.external_value_slots`) this updates
// the value's `sizes` and recomputes `storage.size_in_bytes` as
// byte_stride(dtype) multiplied by every dimension.
//
// Returns Error::Internal if xnn_get_external_value_shape() fails.
runtime::Error Executor::update_xnn_output_shapes(
    const plan::RunXnnSubgraphStep& xnn) {
  auto* const xnn_runtime = xnn.runtime.get();
  const size_t external_count = xnn.external_value_slots.size();

  for (uint32_t ext_id = 0; ext_id < external_count; ++ext_id) {
    size_t rank = 0;
    size_t shape[XNN_MAX_TENSOR_DIMS];
    const auto status =
        xnn_get_external_value_shape(xnn_runtime, ext_id, &rank, shape);
    ET_CHECK_OR_RETURN_ERROR(
        status == xnn_status_success,
        Internal,
        "xnn_get_external_value_shape failed");

    auto& value = values[xnn.external_value_slots[ext_id]];
    value.sizes.resize(rank);

    // Copy the dimensions and accumulate the element count in one pass.
    size_t byte_count = core::byte_stride(value.dtype);
    for (size_t axis = 0; axis < rank; ++axis) {
      value.sizes[axis] = shape[axis];
      byte_count *= shape[axis];
    }
    value.storage.size_in_bytes = byte_count;
  }
  return runtime::Error::Ok;
}
173+
174+
// Re-plans all value storage for a new set of `inputs`.
//
// Steps, in order:
//   1. Specialize `shape_env` against the input specs and the actual inputs,
//      re-run the memory plan, and resize the arena to the planned size.
//   2. Point each input slot's value at the caller's input (sizes, data
//      pointer and byte size are copied; tensor data is not).
//   3. Point every arena-allocated value at its offset inside the arena.
//   4. Resolve the concrete sizes of every arena-allocated value from its
//      symbolic spec, using each symbol's `max` bound.
//      NOTE(review): this dereferences `bound.max` unconditionally and so
//      assumes specialize() has populated it for every referenced symbol —
//      confirm against ShapeEnv.
//   5. Give every operator step a chance to reshape from its input specs.
//
// Returns Error::InvalidArgument if the number of inputs does not match the
// number of input slots, Error::Internal if the memory plan is inconsistent
// with `values` or a dimension resolves to a negative size, or the error
// from specialize()/replan()/arena.resize().
runtime::Error Executor::update_planned_memory(Span<core::Tensor> inputs) {
  // input_slots is indexed by input position below; reject mismatched counts
  // up front instead of reading out of bounds or leaving stale inputs bound.
  ET_CHECK_OR_RETURN_ERROR(
      inputs.size() == input_slots.size(),
      InvalidArgument,
      "Expected %zu inputs, got %zu",
      input_slots.size(),
      inputs.size());

  ET_CHECK_OK_OR_RETURN_ERROR(
      shape_env.specialize({input_specs.data(), input_specs.size()}, inputs));

  ET_CHECK_OK_OR_RETURN_ERROR(memory_plan.replan(shape_env));
  ET_CHECK_OK_OR_RETURN_ERROR(arena.resize(memory_plan.arena_size));

  for (size_t i = 0; i < inputs.size(); i++) {
    const auto slot = input_slots[i];
    values[slot].sizes = inputs[i].sizes;
    values[slot].storage.data = inputs[i].storage.data;
    values[slot].storage.size_in_bytes = inputs[i].storage.size_in_bytes;
  }

  // Checked at runtime (not only via assert) because the loops below index
  // both containers with the same counter.
  ET_CHECK_OR_RETURN_ERROR(
      memory_plan.value_allocations.size() == values.size(),
      Internal,
      "Memory plan has %zu allocations but executor has %zu values",
      memory_plan.value_allocations.size(),
      values.size());

  for (size_t i = 0; i < values.size(); i++) {
    std::visit(
        overloaded{
            [&](plan::ArenaAllocation& alloc) {
              auto& storage = values[i].storage;
              assert(storage.owner == core::StorageOwner::Arena);
              storage.data = static_cast<uint8_t*>(arena.data()) + alloc.offset;
              storage.size_in_bytes = alloc.size;
            },
            [&](plan::DynamicAllocation&) {
              assert(values[i].storage.owner == core::StorageOwner::Self);
            },
            [&](plan::ExternalAllocation&) {
              assert(values[i].storage.owner == core::StorageOwner::External);
            }},
        memory_plan.value_allocations[i]);
  }

  for (size_t i = 0; i < values.size(); i++) {
    if (!std::holds_alternative<plan::ArenaAllocation>(
            memory_plan.value_allocations[i])) {
      continue;
    }
    const auto& spec = memory_plan.value_specs[i];
    values[i].sizes.resize(spec.sizes.size());
    for (size_t d = 0; d < spec.sizes.size(); d++) {
      const auto& dim = spec.sizes[d];
      // Each dimension is an affine expression: offset + sum(coeff * symbol).
      int64_t size = dim.offset;
      for (const auto& term : dim.coeffs) {
        const auto& bound = shape_env.bounds[term.sym];
        size += term.coefficient * static_cast<int64_t>(*bound.max);
      }
      // A negative result would wrap to a huge unsigned size below.
      ET_CHECK_OR_RETURN_ERROR(
          size >= 0,
          Internal,
          "Value %zu dim %zu resolved to negative size %lld",
          i,
          d,
          (long long)size);
      values[i].sizes[d] = static_cast<uint64_t>(size);
    }
  }

  // Named to avoid shadowing the `input_specs` member; hoisted so the buffer
  // is reused across steps.
  std::vector<graph::TensorSpec> step_input_specs;
  for (auto& step : plan.steps) {
    auto* op_step = std::get_if<plan::RunOperatorStep>(&step);
    if (!op_step) {
      continue;
    }

    step_input_specs.clear();
    step_input_specs.reserve(op_step->input_slots.size());
    for (const auto slot : op_step->input_slots) {
      step_input_specs.push_back(memory_plan.value_specs[slot]);
    }
    op_step->op->reshape({step_input_specs.data(), step_input_specs.size()});
  }

  return runtime::Error::Ok;
}
239+
240+
// Builds an Executor for `graph`.
//
// Steps, in order:
//   1. Initialize XNNPACK.
//   2. Create the execution plan and the memory plan for the graph.
//   3. Create one value per memory-plan slot, tagging arena-planned values
//      and pointing constant-node values at the constants' existing storage.
//   4. Let every operator step prepare() against those values (e.g. to pack
//      weights).
//   5. Map each graph input index to its value slot.
//
// On success `graph` is moved into the returned executor, so the caller's
// object is left in a moved-from state. The executor keeps the graph so that
// constant tensor storage referenced by the values stays alive.
//
// Returns Error::Internal if XNNPACK fails to initialize or an input node
// refers to an input index outside graph.input_specs, or the error from
// create_execution_plan().
runtime::Result<Executor> Executor::build(graph::Graph& graph) {
  const auto init_status = xnn_initialize(nullptr);
  ET_CHECK_OR_RETURN_ERROR(
      init_status == xnn_status_success,
      Internal,
      "Failed to initialize XNNPACK: 0x%x",
      (unsigned int)init_status);

  const auto t0 = std::chrono::steady_clock::now();
  ET_UNWRAP(execution_plan, plan::create_execution_plan(graph));
  const auto t1 = std::chrono::steady_clock::now();
  auto memory_plan = plan::create_memory_plan(graph, execution_plan);
  const auto t2 = std::chrono::steady_clock::now();

  // An output's slot is its producing node's base slot (tag) plus the index
  // of the output on that node.
  std::vector<plan::ValueSlot> output_slots;
  output_slots.reserve(graph.outputs.size());
  for (const auto& vh : graph.outputs) {
    output_slots.push_back(graph.nodes[vh.node].tag + vh.output);
  }

  const auto num_slots = memory_plan.value_allocations.size();
  std::vector<core::Tensor> values(num_slots);
  for (size_t i = 0; i < num_slots; i++) {
    values[i].dtype = memory_plan.value_specs[i].dtype;
    if (std::holds_alternative<plan::ArenaAllocation>(
            memory_plan.value_allocations[i])) {
      values[i].storage.owner = core::StorageOwner::Arena;
    }
  }

  // Constant values alias the constant tensors' storage; nothing is copied.
  for (size_t n = 0; n < graph.nodes.size(); n++) {
    auto* cn = std::get_if<graph::ConstantNode>(&graph.nodes[n].value);
    if (!cn) {
      continue;
    }
    const auto slot = graph.nodes[n].tag;
    values[slot].sizes = cn->tensor->sizes;
    values[slot].storage.data =
        const_cast<void*>(static_cast<const void*>(cn->tensor->storage.data));
    values[slot].storage.size_in_bytes = cn->tensor->storage.size_in_bytes;
  }

  const auto t3 = std::chrono::steady_clock::now();

  // Let operators pre-process constant tensors (e.g., pack weights).
  // The pointer buffers are reused across steps to avoid per-step allocation.
  std::vector<core::Tensor*> inputs;
  std::vector<core::Tensor*> outputs;
  for (auto& step : execution_plan.steps) {
    auto* op_step = std::get_if<plan::RunOperatorStep>(&step);
    if (!op_step) {
      continue;
    }

    inputs.clear();
    inputs.reserve(op_step->input_slots.size());
    for (const auto slot : op_step->input_slots) {
      inputs.push_back(&values[slot]);
    }

    outputs.clear();
    outputs.reserve(op_step->output_slots.size());
    for (const auto slot : op_step->output_slots) {
      outputs.push_back(&values[slot]);
    }

    op_step->op->prepare(
        {inputs.data(), inputs.size()}, {outputs.data(), outputs.size()});
  }

  const auto t4 = std::chrono::steady_clock::now();
  ET_LOG(
      Info,
      "Executor::build create_execution_plan=%lldms create_memory_plan=%lldms "
      "setup_values=%lldms prepare=%lldms",
      (long long)std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0)
          .count(),
      (long long)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
          .count(),
      (long long)std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2)
          .count(),
      (long long)std::chrono::duration_cast<std::chrono::milliseconds>(t4 - t3)
          .count());

  std::vector<plan::ValueSlot> input_slots(graph.input_specs.size());
  for (size_t n = 0; n < graph.nodes.size(); n++) {
    auto* in = std::get_if<graph::InputNode>(&graph.nodes[n].value);
    if (!in) {
      continue;
    }
    // Reject a malformed graph instead of writing past the end of the table.
    ET_CHECK_OR_RETURN_ERROR(
        static_cast<size_t>(in->input) < input_slots.size(),
        Internal,
        "Input node %zu refers to input %zu but graph has %zu inputs",
        n,
        static_cast<size_t>(in->input),
        input_slots.size());
    input_slots[in->input] = graph.nodes[n].tag;
  }

  Executor exec;
  exec.input_specs = graph.input_specs;
  exec.input_slots = std::move(input_slots);
  exec.memory_plan = std::move(memory_plan);
  exec.plan = std::move(execution_plan);
  exec.shape_env = ShapeEnv(graph.symint_count());
  exec.output_slots = std::move(output_slots);
  exec.values = std::move(values);
  // Keep the graph (and thus all constant tensor storage) alive for the
  // executor's lifetime; XNNPACK references unpacked constant data directly.
  // This must stay last: everything above still reads from `graph`.
  exec.graph = std::move(graph);
  return std::move(exec);
}
338+
339+
} // namespace executorch::backends::xnnpack::executor
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#pragma once
2+
3+
#include <executorch/backends/xnnpack/runtime/core/tensor.h>
4+
#include <executorch/backends/xnnpack/runtime/executor/arena.h>
5+
#include <executorch/backends/xnnpack/runtime/executor/shape_env.h>
6+
#include <executorch/backends/xnnpack/runtime/graph/graph.h>
7+
#include <executorch/backends/xnnpack/runtime/plan/execution_plan.h>
8+
#include <executorch/backends/xnnpack/runtime/plan/memory_plan.h>
9+
#include <executorch/runtime/core/result.h>
10+
#include <executorch/runtime/core/span.h>
11+
12+
#include <vector>
13+
14+
namespace executorch::backends::xnnpack::executor {
15+
16+
struct Executor {
17+
// Owns the graph (and its constant tensors) for the executor's lifetime.
18+
// XNNPACK keeps pointers into unpacked constant data (e.g. PReLU slopes), so
19+
// this must outlive `plan`'s runtimes. Declared first => destroyed last.
20+
graph::Graph graph;
21+
Arena arena;
22+
std::vector<graph::TensorSpec> input_specs;
23+
std::vector<plan::ValueSlot> input_slots;
24+
plan::MemoryPlan memory_plan;
25+
plan::ExecutionPlan plan;
26+
ShapeEnv shape_env;
27+
std::vector<plan::ValueSlot> output_slots;
28+
std::vector<core::Tensor> values;
29+
30+
runtime::Result<std::vector<core::Tensor>> run(
31+
runtime::Span<core::Tensor> inputs);
32+
runtime::Error run_step(size_t step_idx, const plan::PlanStep& step);
33+
runtime::Error setup_xnn_step(const plan::RunXnnSubgraphStep& xnn);
34+
runtime::Error update_xnn_output_shapes(const plan::RunXnnSubgraphStep& xnn);
35+
runtime::Error update_planned_memory(runtime::Span<core::Tensor> inputs);
36+
37+
static runtime::Result<Executor> build(graph::Graph& graph);
38+
};
39+
40+
} // namespace executorch::backends::xnnpack::executor

0 commit comments

Comments
 (0)