|
20 | 20 |
|
21 | 21 | #include <executorch/extension/image/image_processor.h> |
22 | 22 | #include <executorch/extension/image/image_processor_apple.h> |
| 23 | +#include <executorch/extension/image/image_processor_simd.h> |
23 | 24 |
|
24 | 25 | #include <algorithm> |
25 | 26 | #include <cstring> |
@@ -391,85 +392,6 @@ size_t compute_scale_temp_size( |
391 | 392 | return temp_size > 0 ? static_cast<size_t>(temp_size) : 0; |
392 | 393 | } |
393 | 394 |
|
394 | | -// Deinterleave BGRA uint8 → planar RGB float with fused normalization. |
395 | | -// Handles offset for letterbox padding. |
396 | | -// |
397 | | -// Per channel (R, G, B): vDSP_vfltu8 reads the matching byte from BGRA via |
398 | | -// stride=4 and converts uint8→float, then vDSP_vsmsa applies the fused |
399 | | -// affine `out = in * (scale_factor / std_dev) + (-mean / std_dev)` in-place. |
400 | | -Error deinterleave_bgra_to_chw( |
401 | | - const uint8_t* bgra_data, |
402 | | - int32_t src_w, |
403 | | - int32_t src_h, |
404 | | - int32_t src_stride, |
405 | | - float* output, |
406 | | - int32_t final_w, |
407 | | - int32_t final_h, |
408 | | - int32_t offset_x, |
409 | | - int32_t offset_y, |
410 | | - const Normalization& norm) { |
411 | | - const size_t spatial = static_cast<size_t>(final_w) * final_h; |
412 | | - |
413 | | - // Per-channel affine coefficients for `out = in * a + b`. |
414 | | - // BGRA byte layout: byte 0 = B, byte 1 = G, byte 2 = R; norm.{mean,std_dev} |
415 | | - // are indexed in RGB order (channel 0 = R, 1 = G, 2 = B). |
416 | | - const float a_r = norm.scale_factor / norm.std_dev[0]; |
417 | | - const float a_g = norm.scale_factor / norm.std_dev[1]; |
418 | | - const float a_b = norm.scale_factor / norm.std_dev[2]; |
419 | | - const float b_r = -norm.mean[0] / norm.std_dev[0]; |
420 | | - const float b_g = -norm.mean[1] / norm.std_dev[1]; |
421 | | - const float b_b = -norm.mean[2] / norm.std_dev[2]; |
422 | | - |
423 | | - // When the bias is zero (e.g. zeroToOne / mean=0), a plain scale (vsmul) is |
424 | | - // cheaper than the fused scale+add (vsmsa). |
425 | | - const bool no_offset = (b_r == 0.0f && b_g == 0.0f && b_b == 0.0f); |
426 | | - auto scale_bias = |
427 | | - [no_offset](float* p, const float* a, const float* b, vDSP_Length n) { |
428 | | - if (no_offset) { |
429 | | - vDSP_vsmul(p, 1, a, p, 1, n); |
430 | | - } else { |
431 | | - vDSP_vsmsa(p, 1, a, b, p, 1, n); |
432 | | - } |
433 | | - }; |
434 | | - |
435 | | - // Output planes in CHW order: R, G, B. Each plane is final_w × final_h |
436 | | - // floats; we write a src_h × src_w region starting at (offset_y, offset_x). |
437 | | - float* r_plane = output + 0 * spatial; |
438 | | - float* g_plane = output + 1 * spatial; |
439 | | - float* b_plane = output + 2 * spatial; |
440 | | - |
441 | | - // Fast path: source is contiguous and destination region is the entire |
442 | | - // plane (offsets 0, src dims == final dims). |
443 | | - if (src_stride == src_w * 4 && offset_x == 0 && offset_y == 0 && |
444 | | - src_w == final_w && src_h == final_h) { |
445 | | - const vDSP_Length n = static_cast<vDSP_Length>(src_w) * src_h; |
446 | | - vDSP_vfltu8(bgra_data + 2, 4, r_plane, 1, n); |
447 | | - scale_bias(r_plane, &a_r, &b_r, n); |
448 | | - vDSP_vfltu8(bgra_data + 1, 4, g_plane, 1, n); |
449 | | - scale_bias(g_plane, &a_g, &b_g, n); |
450 | | - vDSP_vfltu8(bgra_data + 0, 4, b_plane, 1, n); |
451 | | - scale_bias(b_plane, &a_b, &b_b, n); |
452 | | - return Error::Ok; |
453 | | - } |
454 | | - |
455 | | - // Slow path: row-by-row to handle stride padding and/or letterbox offsets. |
456 | | - for (int32_t y = 0; y < src_h; ++y) { |
457 | | - const uint8_t* src_row = bgra_data + y * src_stride; |
458 | | - const ptrdiff_t dst_off = (y + offset_y) * final_w + offset_x; |
459 | | - float* r_dst = r_plane + dst_off; |
460 | | - float* g_dst = g_plane + dst_off; |
461 | | - float* b_dst = b_plane + dst_off; |
462 | | - const vDSP_Length n = static_cast<vDSP_Length>(src_w); |
463 | | - vDSP_vfltu8(src_row + 2, 4, r_dst, 1, n); |
464 | | - scale_bias(r_dst, &a_r, &b_r, n); |
465 | | - vDSP_vfltu8(src_row + 1, 4, g_dst, 1, n); |
466 | | - scale_bias(g_dst, &a_g, &b_g, n); |
467 | | - vDSP_vfltu8(src_row + 0, 4, b_dst, 1, n); |
468 | | - scale_bias(b_dst, &a_b, &b_b, n); |
469 | | - } |
470 | | - return Error::Ok; |
471 | | -} |
472 | | - |
473 | 395 | // Rotate an interleaved BGRA (ARGB8888 layout) buffer by `orientation` using |
474 | 396 | // vImage's SIMD/cache-aware 90-degree rotation, writing a tightly-packed result |
475 | 397 | // into `scratch`. UP is handled by the caller (no rotation). out_data/out_w/ |
@@ -590,11 +512,16 @@ Error normalize_bgra_into( |
590 | 512 | offset_y = offset.second; |
591 | 513 | } |
592 | 514 |
|
593 | | - return deinterleave_bgra_to_chw( |
| 515 | + // BGRA byte layout: B=0, G=1, R=2 (alpha dropped); norm is RGB-indexed. |
| 516 | + return deinterleave_to_chw( |
594 | 517 | bgra_data, |
595 | 518 | width, |
596 | 519 | height, |
597 | 520 | stride, |
| 521 | + /*in_channels=*/4, |
| 522 | + /*r_off=*/2, |
| 523 | + /*g_off=*/1, |
| 524 | + /*b_off=*/0, |
598 | 525 | out, |
599 | 526 | final_w, |
600 | 527 | final_h, |
@@ -1380,6 +1307,7 @@ Error process_pixelbuffer_into( |
1380 | 1307 |
|
1381 | 1308 | // Allocate a CHW float tensor sized to the configured target and fill it via |
1382 | 1309 | // process_pixelbuffer_into. |
| 1310 | +// cppcheck-suppress unusedFunction |
1383 | 1311 | Result<TensorPtr> process_pixelbuffer( |
1384 | 1312 | const ImageProcessor& processor, |
1385 | 1313 | CVPixelBufferRef pixelBuffer, |
|
0 commit comments