|
| 1 | +#include <executorch/backends/xnnpack/runtime/executor/executor.h> |
| 2 | + |
| 3 | +#include <executorch/backends/xnnpack/runtime/core/quant_params.h> |
| 4 | +#include <executorch/backends/xnnpack/runtime/core/variant_util.h> |
| 5 | +#include <executorch/backends/xnnpack/runtime/plan/execution_plan.h> |
| 6 | + |
| 7 | +#include <executorch/runtime/platform/log.h> |
| 8 | + |
| 9 | +#include <xnnpack.h> |
| 10 | +#include <cassert> |
| 11 | +#include <chrono> |
| 12 | +#include <cstdlib> |
| 13 | + |
| 14 | +namespace executorch::backends::xnnpack::executor { |
| 15 | + |
| 16 | +using executorch::runtime::Span; |
| 17 | + |
| 18 | +runtime::Result<std::vector<core::Tensor>> Executor::run( |
| 19 | + Span<core::Tensor> inputs) { |
| 20 | + auto t_mem_start = std::chrono::steady_clock::now(); |
| 21 | + ET_CHECK_OK_OR_RETURN_ERROR(update_planned_memory(inputs)); |
| 22 | + auto t_mem_end = std::chrono::steady_clock::now(); |
| 23 | + ET_LOG( |
| 24 | + Info, |
| 25 | + "update_planned_memory %lldus", |
| 26 | + (long long)std::chrono::duration_cast<std::chrono::microseconds>( |
| 27 | + t_mem_end - t_mem_start) |
| 28 | + .count()); |
| 29 | + |
| 30 | + for (size_t si = 0; si < plan.steps.size(); si++) { |
| 31 | + ET_CHECK_OK_OR_RETURN_ERROR(run_step(si, plan.steps[si])); |
| 32 | + } |
| 33 | + |
| 34 | + std::vector<core::Tensor> outputs; |
| 35 | + outputs.reserve(output_slots.size()); |
| 36 | + for (auto slot : output_slots) { |
| 37 | + auto& val = values[slot]; |
| 38 | + core::Tensor t; |
| 39 | + t.dtype = val.dtype; |
| 40 | + t.sizes = val.sizes; |
| 41 | + t.storage.data = val.storage.data; |
| 42 | + t.storage.size_in_bytes = val.storage.size_in_bytes; |
| 43 | + t.storage.owner = core::StorageOwner::External; |
| 44 | + outputs.push_back(std::move(t)); |
| 45 | + } |
| 46 | + return outputs; |
| 47 | +} |
| 48 | + |
| 49 | +runtime::Error Executor::run_step(size_t step_idx, const plan::PlanStep& step) { |
| 50 | + runtime::Error err = runtime::Error::Ok; |
| 51 | + std::visit( |
| 52 | + overloaded{ |
| 53 | + [&](const plan::RunOperatorStep& s) { |
| 54 | + std::vector<core::Tensor*> inputs; |
| 55 | + inputs.reserve(s.input_slots.size()); |
| 56 | + for (auto slot : s.input_slots) { |
| 57 | + inputs.push_back(&values[slot]); |
| 58 | + } |
| 59 | + |
| 60 | + std::vector<core::Tensor*> outputs; |
| 61 | + outputs.reserve(s.output_slots.size()); |
| 62 | + for (auto slot : s.output_slots) { |
| 63 | + outputs.push_back(&values[slot]); |
| 64 | + } |
| 65 | + |
| 66 | + auto t0 = std::chrono::steady_clock::now(); |
| 67 | + s.op->execute( |
| 68 | + {inputs.data(), inputs.size()}, |
| 69 | + {outputs.data(), outputs.size()}); |
| 70 | + auto t1 = std::chrono::steady_clock::now(); |
| 71 | + auto us = |
| 72 | + std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0) |
| 73 | + .count(); |
| 74 | + ET_LOG(Info, "OpStep[%zu] %lldus", step_idx, (long long)us); |
| 75 | + }, |
| 76 | + [&](const plan::RunXnnSubgraphStep& s) { |
| 77 | + auto t0 = std::chrono::steady_clock::now(); |
| 78 | + err = setup_xnn_step(s); |
| 79 | + if (err != runtime::Error::Ok) { |
| 80 | + return; |
| 81 | + } |
| 82 | + auto t1 = std::chrono::steady_clock::now(); |
| 83 | + auto status = xnn_invoke_runtime(s.runtime.get()); |
| 84 | + if (status != xnn_status_success) { |
| 85 | + ET_LOG( |
| 86 | + Error, |
| 87 | + "xnn_invoke_runtime failed: 0x%x", |
| 88 | + (unsigned int)status); |
| 89 | + err = runtime::Error::Internal; |
| 90 | + return; |
| 91 | + } |
| 92 | + auto t2 = std::chrono::steady_clock::now(); |
| 93 | + err = update_xnn_output_shapes(s); |
| 94 | + if (err != runtime::Error::Ok) { |
| 95 | + return; |
| 96 | + } |
| 97 | + auto setup_us = |
| 98 | + std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0) |
| 99 | + .count(); |
| 100 | + auto invoke_us = |
| 101 | + std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1) |
| 102 | + .count(); |
| 103 | + ET_LOG( |
| 104 | + Info, |
| 105 | + "XnnStep setup=%lldus invoke=%lldus", |
| 106 | + (long long)setup_us, |
| 107 | + (long long)invoke_us); |
| 108 | + }}, |
| 109 | + step); |
| 110 | + return err; |
| 111 | +} |
| 112 | + |
| 113 | +runtime::Error Executor::setup_xnn_step(const plan::RunXnnSubgraphStep& xnn) { |
| 114 | + auto* rt = xnn.runtime.get(); |
| 115 | + |
| 116 | + for (uint32_t eid = 0; eid < xnn.num_external_inputs; eid++) { |
| 117 | + auto slot = xnn.external_value_slots[eid]; |
| 118 | + auto& tensor = values[slot]; |
| 119 | + std::vector<size_t> dims(tensor.sizes.begin(), tensor.sizes.end()); |
| 120 | + auto status = xnn_reshape_external_value(rt, eid, dims.size(), dims.data()); |
| 121 | + ET_CHECK_OR_RETURN_ERROR( |
| 122 | + status == xnn_status_success, |
| 123 | + Internal, |
| 124 | + "xnn_reshape_external_value failed: 0x%x", |
| 125 | + (unsigned int)status); |
| 126 | + } |
| 127 | + |
| 128 | + ET_CHECK_OR_RETURN_ERROR( |
| 129 | + xnn_reshape_runtime(rt) == xnn_status_success, |
| 130 | + Internal, |
| 131 | + "xnn_reshape_runtime failed"); |
| 132 | + |
| 133 | + std::vector<xnn_external_value> externals(xnn.external_value_slots.size()); |
| 134 | + for (uint32_t eid = 0; eid < xnn.external_value_slots.size(); eid++) { |
| 135 | + auto slot = xnn.external_value_slots[eid]; |
| 136 | + externals[eid].id = eid; |
| 137 | + externals[eid].data = values[slot].storage.data; |
| 138 | + } |
| 139 | + ET_CHECK_OR_RETURN_ERROR( |
| 140 | + xnn_setup_runtime_v2(rt, externals.size(), externals.data()) == |
| 141 | + xnn_status_success, |
| 142 | + Internal, |
| 143 | + "xnn_setup_runtime_v2 failed"); |
| 144 | + return runtime::Error::Ok; |
| 145 | +} |
| 146 | + |
| 147 | +runtime::Error Executor::update_xnn_output_shapes( |
| 148 | + const plan::RunXnnSubgraphStep& xnn) { |
| 149 | + auto* rt = xnn.runtime.get(); |
| 150 | + |
| 151 | + for (uint32_t eid = 0; eid < xnn.external_value_slots.size(); eid++) { |
| 152 | + auto slot = xnn.external_value_slots[eid]; |
| 153 | + size_t num_dims = 0; |
| 154 | + size_t dims[XNN_MAX_TENSOR_DIMS]; |
| 155 | + ET_CHECK_OR_RETURN_ERROR( |
| 156 | + xnn_get_external_value_shape(rt, eid, &num_dims, dims) == |
| 157 | + xnn_status_success, |
| 158 | + Internal, |
| 159 | + "xnn_get_external_value_shape failed"); |
| 160 | + |
| 161 | + values[slot].sizes.resize(num_dims); |
| 162 | + for (size_t d = 0; d < num_dims; d++) { |
| 163 | + values[slot].sizes[d] = dims[d]; |
| 164 | + } |
| 165 | + size_t nbytes = core::byte_stride(values[slot].dtype); |
| 166 | + for (size_t d = 0; d < num_dims; d++) { |
| 167 | + nbytes *= dims[d]; |
| 168 | + } |
| 169 | + values[slot].storage.size_in_bytes = nbytes; |
| 170 | + } |
| 171 | + return runtime::Error::Ok; |
| 172 | +} |
| 173 | + |
| 174 | +runtime::Error Executor::update_planned_memory(Span<core::Tensor> inputs) { |
| 175 | + ET_CHECK_OK_OR_RETURN_ERROR( |
| 176 | + shape_env.specialize({input_specs.data(), input_specs.size()}, inputs)); |
| 177 | + |
| 178 | + ET_CHECK_OK_OR_RETURN_ERROR(memory_plan.replan(shape_env)); |
| 179 | + ET_CHECK_OK_OR_RETURN_ERROR(arena.resize(memory_plan.arena_size)); |
| 180 | + |
| 181 | + for (size_t i = 0; i < inputs.size(); i++) { |
| 182 | + auto slot = input_slots[i]; |
| 183 | + values[slot].sizes = inputs[i].sizes; |
| 184 | + values[slot].storage.data = inputs[i].storage.data; |
| 185 | + values[slot].storage.size_in_bytes = inputs[i].storage.size_in_bytes; |
| 186 | + } |
| 187 | + |
| 188 | + assert(memory_plan.value_allocations.size() == values.size()); |
| 189 | + for (auto i = 0u; i < values.size(); i++) { |
| 190 | + std::visit( |
| 191 | + overloaded{ |
| 192 | + [&](plan::ArenaAllocation& alloc) { |
| 193 | + auto& storage = values[i].storage; |
| 194 | + assert(storage.owner == core::StorageOwner::Arena); |
| 195 | + storage.data = static_cast<uint8_t*>(arena.data()) + alloc.offset; |
| 196 | + storage.size_in_bytes = alloc.size; |
| 197 | + }, |
| 198 | + [&](plan::DynamicAllocation& alloc) { |
| 199 | + assert(values[i].storage.owner == core::StorageOwner::Self); |
| 200 | + }, |
| 201 | + [&](plan::ExternalAllocation&) { |
| 202 | + assert(values[i].storage.owner == core::StorageOwner::External); |
| 203 | + }}, |
| 204 | + memory_plan.value_allocations.at(i)); |
| 205 | + } |
| 206 | + |
| 207 | + for (size_t i = 0; i < values.size(); i++) { |
| 208 | + if (std::holds_alternative<plan::ArenaAllocation>( |
| 209 | + memory_plan.value_allocations[i])) { |
| 210 | + auto& spec = memory_plan.value_specs[i]; |
| 211 | + values[i].sizes.resize(spec.sizes.size()); |
| 212 | + for (size_t d = 0; d < spec.sizes.size(); d++) { |
| 213 | + auto& dim = spec.sizes[d]; |
| 214 | + int64_t size = dim.offset; |
| 215 | + for (auto& term : dim.coeffs) { |
| 216 | + auto& bound = shape_env.bounds[term.sym]; |
| 217 | + size += term.coefficient * static_cast<int64_t>(*bound.max); |
| 218 | + } |
| 219 | + values[i].sizes[d] = static_cast<uint64_t>(size); |
| 220 | + } |
| 221 | + } |
| 222 | + } |
| 223 | + |
| 224 | + for (auto& step : plan.steps) { |
| 225 | + auto* op_step = std::get_if<plan::RunOperatorStep>(&step); |
| 226 | + if (!op_step) |
| 227 | + continue; |
| 228 | + |
| 229 | + std::vector<graph::TensorSpec> input_specs; |
| 230 | + input_specs.reserve(op_step->input_slots.size()); |
| 231 | + for (auto slot : op_step->input_slots) { |
| 232 | + input_specs.push_back(memory_plan.value_specs[slot]); |
| 233 | + } |
| 234 | + op_step->op->reshape({input_specs.data(), input_specs.size()}); |
| 235 | + } |
| 236 | + |
| 237 | + return runtime::Error::Ok; |
| 238 | +} |
| 239 | + |
| 240 | +runtime::Result<Executor> Executor::build(graph::Graph& graph) { |
| 241 | + auto t_build_start = std::chrono::steady_clock::now(); |
| 242 | + |
| 243 | + auto init_status = xnn_initialize(nullptr); |
| 244 | + ET_CHECK_OR_RETURN_ERROR( |
| 245 | + init_status == xnn_status_success, |
| 246 | + Internal, |
| 247 | + "Failed to initialize XNNPACK: 0x%x", |
| 248 | + (unsigned int)init_status); |
| 249 | + |
| 250 | + auto t0 = std::chrono::steady_clock::now(); |
| 251 | + ET_UNWRAP(execution_plan, plan::create_execution_plan(graph)); |
| 252 | + auto t1 = std::chrono::steady_clock::now(); |
| 253 | + auto memory_plan = plan::create_memory_plan(graph, execution_plan); |
| 254 | + auto t2 = std::chrono::steady_clock::now(); |
| 255 | + |
| 256 | + std::vector<plan::ValueSlot> output_slots; |
| 257 | + output_slots.reserve(graph.outputs.size()); |
| 258 | + for (auto& vh : graph.outputs) { |
| 259 | + output_slots.push_back(graph.nodes[vh.node].tag + vh.output); |
| 260 | + } |
| 261 | + |
| 262 | + auto num_slots = memory_plan.value_allocations.size(); |
| 263 | + std::vector<core::Tensor> values(num_slots); |
| 264 | + for (size_t i = 0; i < num_slots; i++) { |
| 265 | + values[i].dtype = memory_plan.value_specs[i].dtype; |
| 266 | + if (std::holds_alternative<plan::ArenaAllocation>( |
| 267 | + memory_plan.value_allocations[i])) { |
| 268 | + values[i].storage.owner = core::StorageOwner::Arena; |
| 269 | + } |
| 270 | + } |
| 271 | + |
| 272 | + for (size_t n = 0; n < graph.nodes.size(); n++) { |
| 273 | + auto* cn = std::get_if<graph::ConstantNode>(&graph.nodes[n].value); |
| 274 | + if (!cn) |
| 275 | + continue; |
| 276 | + auto slot = graph.nodes[n].tag; |
| 277 | + values[slot].sizes = cn->tensor->sizes; |
| 278 | + values[slot].storage.data = |
| 279 | + const_cast<void*>(static_cast<const void*>(cn->tensor->storage.data)); |
| 280 | + values[slot].storage.size_in_bytes = cn->tensor->storage.size_in_bytes; |
| 281 | + } |
| 282 | + |
| 283 | + auto t3 = std::chrono::steady_clock::now(); |
| 284 | + |
| 285 | + // Let operators pre-process constant tensors (e.g., pack weights). |
| 286 | + for (auto& step : execution_plan.steps) { |
| 287 | + auto* op_step = std::get_if<plan::RunOperatorStep>(&step); |
| 288 | + if (!op_step) |
| 289 | + continue; |
| 290 | + |
| 291 | + std::vector<core::Tensor*> inputs; |
| 292 | + for (auto slot : op_step->input_slots) |
| 293 | + inputs.push_back(&values[slot]); |
| 294 | + |
| 295 | + std::vector<core::Tensor*> outputs; |
| 296 | + for (auto slot : op_step->output_slots) |
| 297 | + outputs.push_back(&values[slot]); |
| 298 | + |
| 299 | + op_step->op->prepare( |
| 300 | + {inputs.data(), inputs.size()}, {outputs.data(), outputs.size()}); |
| 301 | + } |
| 302 | + |
| 303 | + auto t4 = std::chrono::steady_clock::now(); |
| 304 | + ET_LOG( |
| 305 | + Info, |
| 306 | + "Executor::build create_execution_plan=%lldms create_memory_plan=%lldms " |
| 307 | + "setup_values=%lldms prepare=%lldms", |
| 308 | + (long long)std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0) |
| 309 | + .count(), |
| 310 | + (long long)std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1) |
| 311 | + .count(), |
| 312 | + (long long)std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2) |
| 313 | + .count(), |
| 314 | + (long long)std::chrono::duration_cast<std::chrono::milliseconds>(t4 - t3) |
| 315 | + .count()); |
| 316 | + |
| 317 | + std::vector<plan::ValueSlot> input_slots(graph.input_specs.size()); |
| 318 | + for (size_t n = 0; n < graph.nodes.size(); n++) { |
| 319 | + auto* in = std::get_if<graph::InputNode>(&graph.nodes[n].value); |
| 320 | + if (!in) |
| 321 | + continue; |
| 322 | + input_slots[in->input] = graph.nodes[n].tag; |
| 323 | + } |
| 324 | + |
| 325 | + Executor exec; |
| 326 | + exec.input_specs = graph.input_specs; |
| 327 | + exec.input_slots = std::move(input_slots); |
| 328 | + exec.memory_plan = std::move(memory_plan); |
| 329 | + exec.plan = std::move(execution_plan); |
| 330 | + exec.shape_env = ShapeEnv(graph.symint_count()); |
| 331 | + exec.output_slots = std::move(output_slots); |
| 332 | + exec.values = std::move(values); |
| 333 | + // Keep the graph (and thus all constant tensor storage) alive for the |
| 334 | + // executor's lifetime; XNNPACK references unpacked constant data directly. |
| 335 | + exec.graph = std::move(graph); |
| 336 | + return std::move(exec); |
| 337 | +} |
| 338 | + |
| 339 | +} // namespace executorch::backends::xnnpack::executor |
0 commit comments