// Scale kernel for 8-bit unsigned data that copies one source element per
// output element (nearest-neighbour, per the function name).
// NOTE(review): this listing is fragmentary - part of the parameter list,
// the definitions of `hr`, `win` and `dst`, and the body of the vectorised
// loop are not visible here.
37 void u8_neon_scale_nearest(
const ITensor *
src,
39 const ITensor *offsets,
40 float sampling_offset,
// Extent of dimension 0 including its left/right padding, in elements.
44 const size_t in_stride_c =
src->info()->dimension(0) +
src->info()->padding().left +
src->info()->padding().right;
// Extent of dimension 1 including its top/bottom padding, in elements.
45 const size_t in_stride_w =
src->info()->dimension(1) +
src->info()->padding().top +
src->info()->padding().bottom;
// Number of elements between two consecutive positions along dimension 2.
46 const size_t in_stride_wc = in_stride_w * in_stride_c;
47 const size_t in_dim_h =
src->info()->dimension(2);
// The x range of the window is walked manually by the two loops below.
51 const auto window_start_x =
static_cast<int32_t
>(window.x().start());
52 const auto window_end_x =
static_cast<int32_t
>(window.x().end());
// Number of elements consumed per iteration of the main x loop.
53 const int window_step_x = 16;
57 Iterator out(
dst, win);
// Address of the first source element (buffer base plus first-element offset).
59 const uint8_t *in_ptr_start =
src->buffer() +
src->info()->offset_first_element_in_bytes();
// Byte stride of dimension 3 of the source.
60 const unsigned int in_stride_bytes_hwc =
src->info()->strides_in_bytes()[3];
64 [&](
const Coordinates &
id)
// Value read from `offsets` at (id.y(), id.z()), scaled by the dimension-0 stride.
67 *
reinterpret_cast<const int32_t *
>(offsets->ptr_to_element(Coordinates(
id.y(),
id.z()))) * in_stride_c;
// Source index along dimension 2. Only the second alternative of the
// conditional is visible: floor((id.z() + sampling_offset) * hr).
68 const auto in_hi =
static_cast<int>(
70 : std::floor((id.z() + sampling_offset) * hr));
71 const int offset_row = in_hi * in_stride_wc;
72 int32_t x = window_start_x;
// Source base pointer for index id[3] of dimension 3.
73 const uint8_t *in_ptr =
reinterpret_cast<const uint8_t *
>(in_ptr_start + in_stride_bytes_hwc *
id[3]);
// Main loop in steps of window_step_x (body not visible in this listing).
75 for (; x <= window_end_x - window_step_x; x += window_step_x)
// Tail loop: copies the remaining elements one at a time.
80 for (; x < window_end_x; ++x)
82 *(
reinterpret_cast<uint8_t *
>(out.ptr()) + x) = *(in_ptr +
offset + offset_row + x);
// Bilinear scale kernel for 8-bit unsigned data (per the function name).
// Two code paths are visible: one that reads the four neighbours with
// bounds checks and substitutes a constant border value, and one that clamps
// the neighbour coordinates and processes channels in blocks of 16.
// NOTE(review): this listing is fragmentary - the branch condition selecting
// between the paths, part of the parameter list, the definitions of
// `scale_x`/`scale_y`, `cout`, `out_0`..`out_3`, and several right-hand
// sides are not visible here.
88 void u8_neon_scale_bilinear(
const ITensor *
src,
90 const ITensor *offsets,
94 PixelValue constant_border_value,
95 float sampling_offset,
105 const int input_width =
src->info()->dimension(1);
106 const int input_height =
src->info()->dimension(2);
110 Iterator out(
dst, window);
// Extent of dimension 0 including its left/right padding, in elements.
111 const int in_stride_c =
src->info()->dimension(0) +
src->info()->padding().left +
src->info()->padding().right;
// Elements between two consecutive positions along dimension 2
// (dimension-1 extent plus top/bottom padding, times in_stride_c).
112 const int in_stride_wc =
113 in_stride_c * (input_width +
src->info()->padding().top +
src->info()->padding().bottom);
117 Window win_in(window);
120 Iterator in(
src, win_in);
// Value substituted for neighbours that fall outside the source bounds.
122 const uint8_t const_border_value =
static_cast<uint8_t
>(constant_border_value.get<uint8_t>());
125 [&](
const Coordinates &
id)
// Per-element values read from `offsets`, `dx` and `dy` at (id.y(), id.z()).
128 *
reinterpret_cast<const int32_t *
>(offsets->ptr_to_element(Coordinates(
id.y(),
id.z())));
129 const auto dx_val = *
reinterpret_cast<const float *
>(dx->ptr_to_element(Coordinates(
id.y(),
id.z())));
130 const auto dy_val = *
reinterpret_cast<const float *
>(dy->ptr_to_element(Coordinates(
id.y(),
id.z())));
// Source index along dimension 2 of the upper neighbour row.
131 const int32_t in_hi = std::floor((
id.z() + sampling_offset) *
scale_y - sampling_offset);
132 const uint8_t *in_ptr =
133 reinterpret_cast<const uint8_t *
>(in.ptr()) +
offset * in_stride_c + in_hi * in_stride_wc;
// Each of the four neighbours is read only when its coordinates lie inside
// the source extent; otherwise const_border_value is used. The "-1 <=" and
// "< extent - 1" bounds apply to the neighbours taken at +1 along that axis.
135 const auto a00 = (0 <=
offset &&
offset < input_width && 0 <= in_hi && in_hi < input_height)
137 : const_border_value;
138 const auto a01 = (-1 <=
offset &&
offset < input_width - 1 && 0 <= in_hi && in_hi < input_height)
139 ? *(in_ptr + in_stride_c)
140 : const_border_value;
141 const auto a10 = (0 <=
offset &&
offset < input_width && -1 <= in_hi && in_hi < input_height - 1)
142 ? *(in_ptr + in_stride_wc)
143 : const_border_value;
144 const auto a11 = (-1 <=
offset &&
offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1)
145 ? *(in_ptr + in_stride_c + in_stride_wc)
146 : const_border_value;
// Store to the current output element (right-hand side not visible here).
148 *
reinterpret_cast<uint8_t *
>(out.ptr()) =
// Second code path starts here (the condition selecting it is not visible).
155 using ExactTagType =
typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
// Byte strides of dimensions 1, 2 and 3 for source and destination.
157 const int in_stride_x =
src->info()->strides_in_bytes()[1];
158 const int in_stride_y =
src->info()->strides_in_bytes()[2];
159 const int in_stride_b =
src->info()->strides_in_bytes()[3];
160 const int out_stride_x =
dst->info()->strides_in_bytes()[1];
161 const int out_stride_y =
dst->info()->strides_in_bytes()[2];
162 const int out_stride_b =
dst->info()->strides_in_bytes()[3];
164 const int out_dim_ch =
dst->info()->dimension(0);
// Number of dimension-0 elements handled per vector-loop iteration.
165 constexpr
int step_cout = 16;
// window_execution supplies the loop bounds below; its X dimension is
// replaced by a single step because dimension 0 is iterated manually.
167 Window window_execution = window;
168 window_execution.set(
Window::DimX, Window::Dimension(0, 1, 1));
// The iterators use a window whose Y and Z dimensions are set to (0, 0, 0);
// the explicit loops below apply the offsets for dimensions 1, 2 and 3.
169 Window win_in_out(window);
170 win_in_out.set(
Window::DimY, Window::Dimension(0, 0, 0));
171 win_in_out.set(
Window::DimZ, Window::Dimension(0, 0, 0));
172 Iterator in(
src, win_in_out);
173 Iterator out(
dst, win_in_out);
175 const int xo_start = window_execution[1].start();
176 const int xo_end = window_execution[1].end();
177 const int xo_step = window_execution[1].step();
178 const int yo_start = window_execution[2].start();
179 const int yo_end = window_execution[2].end();
180 const int yo_step = window_execution[2].step();
181 const int bo_start = window_execution[3].start();
182 const int bo_end = window_execution[3].end();
183 const int bo_step = window_execution[3].step();
// Constant term chosen so that o * scale + fp_coord_offset equals
// (o + sampling_offset) * scale - sampling_offset.
185 const float fp_coord_offset_y = sampling_offset * (
scale_y - 1);
186 const float fp_coord_offset_x = sampling_offset * (
scale_x - 1);
188 for (
int bo = bo_start; bo < bo_end; bo += bo_step)
190 const uint8_t *in_ptr = in.ptr() + bo * in_stride_b;
191 uint8_t *out_ptr = out.ptr() + bo * out_stride_b;
193 for (
int yo = yo_start; yo < yo_end; yo += yo_step)
// Fractional source coordinate along dimension 2 for output index yo.
196 const float yi_f = yo *
scale_y + fp_coord_offset_y;
198 const int yi =
static_cast<int>(std::floor(yi_f));
// a1 is the fractional part of yi_f; b1 is its complement.
200 const float a1 = (yi_f -
static_cast<float>(yi));
201 const float b1 = (1.f - a1);
// Both neighbour indices are clamped into [0, input_height - 1].
203 const int yi0 = utility::clamp<int>(yi, 0, input_height - 1);
204 const int yi1 = utility::clamp<int>(yi + 1, 0, input_height - 1);
206 const uint8_t *in_ptr_yi0 = in_ptr + yi0 * in_stride_y;
207 const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
209 uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
210 for (
int xo = xo_start; xo < xo_end; xo += xo_step)
// Fractional source coordinate along dimension 1 for output index xo.
213 const float xi_f = xo *
scale_x + fp_coord_offset_x;
215 const int xi =
static_cast<int>(std::floor(xi_f));
// a is the fractional part of xi_f; b is its complement.
217 const float a = (xi_f -
static_cast<float>(xi));
218 const float b = (1.f - a);
// Weights of the four neighbours (products of the per-axis fractions).
220 const float s00_s =
b * b1;
221 const float s01_s = a * b1;
222 const float s10_s =
b * a1;
223 const float s11_s = a * a1;
// Both neighbour indices are clamped into [0, input_width - 1].
230 const int xi0 = utility::clamp<int>(xi, 0, input_width - 1);
231 const int xi1 = utility::clamp<int>(xi + 1, 0, input_width - 1);
233 const auto in_ptr_xi0_yi0 = in_ptr_yi0 + xi0 * in_stride_x;
234 const auto in_ptr_xi1_yi0 = in_ptr_yi0 + xi1 * in_stride_x;
235 const auto in_ptr_xi0_yi1 = in_ptr_yi1 + xi0 * in_stride_x;
236 const auto in_ptr_xi1_yi1 = in_ptr_yi1 + xi1 * in_stride_x;
238 uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
// Vector loop over dimension 0 in blocks of step_cout elements; the
// arithmetic between the loads and the conversions is not visible here.
241 for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
243 const auto in00 =
wrapper::vloadq(in_ptr_xi0_yi0 + cout *
sizeof(uint8_t));
244 const auto in01 =
wrapper::vloadq(in_ptr_xi1_yi0 + cout *
sizeof(uint8_t));
245 const auto in10 =
wrapper::vloadq(in_ptr_xi0_yi1 + cout *
sizeof(uint8_t));
246 const auto in11 =
wrapper::vloadq(in_ptr_xi1_yi1 + cout *
sizeof(uint8_t));
// Float-to-integer conversion differs by target: vcvta on AArch64 non-bare-metal
// builds, vcvt otherwise. NOTE(review): presumably vcvta rounds to nearest and
// vcvt truncates, mirroring the scalar tail below - confirm against the wrapper.
300 #if defined(__aarch64__) && !defined(BARE_METAL)
301 const auto out_0_int = wrapper::vcvta<uint32_t>(out_0);
302 const auto out_1_int = wrapper::vcvta<uint32_t>(out_1);
303 const auto out_2_int = wrapper::vcvta<uint32_t>(out_2);
304 const auto out_3_int = wrapper::vcvta<uint32_t>(out_3);
305 #else // defined(__aarch64__) && !defined(BARE_METAL)
306 const auto out_0_int = wrapper::vcvt<uint32_t>(out_0);
307 const auto out_1_int = wrapper::vcvt<uint32_t>(out_1);
308 const auto out_2_int = wrapper::vcvt<uint32_t>(out_2);
309 const auto out_3_int = wrapper::vcvt<uint32_t>(out_3);
310 #endif // defined(__aarch64__) && !defined(BARE_METAL)
311 const auto low_part =
313 const auto high_part =
// Scalar tail: remaining dimension-0 elements, one at a time.
320 for (; cout < out_dim_ch; ++cout)
322 const uint8_t in00 = *(in_ptr_xi0_yi0 + cout *
sizeof(uint8_t));
323 const uint8_t in01 = *(in_ptr_xi1_yi0 + cout *
sizeof(uint8_t));
324 const uint8_t in10 = *(in_ptr_xi0_yi1 + cout *
sizeof(uint8_t));
325 const uint8_t in11 = *(in_ptr_xi1_yi1 + cout *
sizeof(uint8_t));
// Weighted sum of the four neighbours.
327 float out0 = in00 * s00_s;
328 out0 += in01 * s01_s;
329 out0 += in10 * s10_s;
330 out0 += in11 * s11_s;
// std::round is applied on AArch64 non-bare-metal builds; otherwise the
// float is converted with a plain cast (which discards the fraction).
333 #if defined(__aarch64__) && !defined(BARE_METAL)
334 *(out_ptr_xo_yo + cout *
sizeof(uint8_t)) =
static_cast<uint8_t
>(
std::round(out0));
335 #else // defined(__aarch64__) && !defined(BARE_METAL)
336 *(out_ptr_xo_yo + cout *
sizeof(uint8_t)) =
static_cast<uint8_t
>(out0);
337 #endif // defined(__aarch64__) && !defined(BARE_METAL)
// Bilinear scale kernel for 8-bit signed data (per the function name).
// The visible code clamps the neighbour coordinates to the source extent and
// processes dimension 0 in blocks of 16 elements followed by a scalar tail.
// NOTE(review): this listing is fragmentary - part of the parameter list,
// the definitions of `scale_x`/`scale_y`, `cout`, `out_0`..`out_3`, and
// several right-hand sides are not visible here.
349 void s8_neon_scale_bilinear(
const ITensor *
src,
351 const ITensor *offsets,
355 PixelValue constant_border_value,
356 float sampling_offset,
358 const Window &window)
363 using ExactTagType =
typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
// Byte strides of dimensions 1, 2 and 3 for source and destination.
371 const int in_stride_x =
src->info()->strides_in_bytes()[1];
372 const int in_stride_y =
src->info()->strides_in_bytes()[2];
373 const int in_stride_b =
src->info()->strides_in_bytes()[3];
374 const int out_stride_x =
dst->info()->strides_in_bytes()[1];
375 const int out_stride_y =
dst->info()->strides_in_bytes()[2];
376 const int out_stride_b =
dst->info()->strides_in_bytes()[3];
377 const int input_width =
src->info()->dimension(1);
378 const int input_height =
src->info()->dimension(2);
379 const int out_dim_ch =
dst->info()->dimension(0);
// Number of dimension-0 elements handled per vector-loop iteration.
380 constexpr
int step_cout = 16;
// window_execution supplies the loop bounds below; its X dimension is
// replaced by a single step because dimension 0 is iterated manually.
382 Window window_execution = window;
383 window_execution.set(
Window::DimX, Window::Dimension(0, 1, 1));
// The iterators use a window whose Y and Z dimensions are set to (0, 0, 0);
// the explicit loops below apply the offsets for dimensions 1, 2 and 3.
384 Window win_in_out(window);
385 win_in_out.set(
Window::DimY, Window::Dimension(0, 0, 0));
386 win_in_out.set(
Window::DimZ, Window::Dimension(0, 0, 0));
387 Iterator in(
src, win_in_out);
388 Iterator out(
dst, win_in_out);
390 const int xo_start = window_execution[1].start();
391 const int xo_end = window_execution[1].end();
392 const int xo_step = window_execution[1].step();
393 const int yo_start = window_execution[2].start();
394 const int yo_end = window_execution[2].end();
395 const int yo_step = window_execution[2].step();
396 const int bo_start = window_execution[3].start();
397 const int bo_end = window_execution[3].end();
398 const int bo_step = window_execution[3].step();
// Constant term chosen so that o * scale + fp_coord_offset equals
// (o + sampling_offset) * scale - sampling_offset.
400 const float fp_coord_offset_y = sampling_offset * (
scale_y - 1);
401 const float fp_coord_offset_x = sampling_offset * (
scale_x - 1);
403 for (
int bo = bo_start; bo < bo_end; bo += bo_step)
// The iterator pointers are reinterpreted as int8_t for signed arithmetic.
405 const int8_t *in_ptr =
reinterpret_cast<int8_t *
>(in.ptr() + bo * in_stride_b);
406 int8_t *out_ptr =
reinterpret_cast<int8_t *
>(out.ptr() + bo * out_stride_b);
408 for (
int yo = yo_start; yo < yo_end; yo += yo_step)
// Fractional source coordinate along dimension 2 for output index yo.
411 const float yi_f = yo *
scale_y + fp_coord_offset_y;
413 const int yi =
static_cast<int>(std::floor(yi_f));
// a1 is the fractional part of yi_f; b1 is its complement.
415 const float a1 = (yi_f -
static_cast<float>(yi));
416 const float b1 = (1.f - a1);
// Both neighbour indices are clamped into [0, input_height - 1].
418 const int yi0 = utility::clamp<int>(yi, 0, input_height - 1);
419 const int yi1 = utility::clamp<int>(yi + 1, 0, input_height - 1);
421 const int8_t *in_ptr_yi0 = in_ptr + yi0 * in_stride_y;
422 const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
424 int8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
425 for (
int xo = xo_start; xo < xo_end; xo += xo_step)
// Fractional source coordinate along dimension 1 for output index xo.
428 const float xi_f = xo *
scale_x + fp_coord_offset_x;
430 const int xi =
static_cast<int>(std::floor(xi_f));
// a is the fractional part of xi_f; b is its complement.
432 const float a = (xi_f -
static_cast<float>(xi));
433 const float b = (1.f - a);
// Weights of the four neighbours (products of the per-axis fractions).
435 const float s00_s =
b * b1;
436 const float s01_s = a * b1;
437 const float s10_s =
b * a1;
438 const float s11_s = a * a1;
// Both neighbour indices are clamped into [0, input_width - 1].
445 const int xi0 = utility::clamp<int>(xi, 0, input_width - 1);
446 const int xi1 = utility::clamp<int>(xi + 1, 0, input_width - 1);
448 const auto in_ptr_xi0_yi0 = in_ptr_yi0 + xi0 * in_stride_x;
449 const auto in_ptr_xi1_yi0 = in_ptr_yi0 + xi1 * in_stride_x;
450 const auto in_ptr_xi0_yi1 = in_ptr_yi1 + xi0 * in_stride_x;
451 const auto in_ptr_xi1_yi1 = in_ptr_yi1 + xi1 * in_stride_x;
453 int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
// Vector loop over dimension 0 in blocks of step_cout elements; the
// arithmetic between the loads and the conversions is not visible here.
456 for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
458 const auto in00 =
wrapper::vloadq(in_ptr_xi0_yi0 + cout *
sizeof(int8_t));
459 const auto in01 =
wrapper::vloadq(in_ptr_xi1_yi0 + cout *
sizeof(int8_t));
460 const auto in10 =
wrapper::vloadq(in_ptr_xi0_yi1 + cout *
sizeof(int8_t));
461 const auto in11 =
wrapper::vloadq(in_ptr_xi1_yi1 + cout *
sizeof(int8_t));
// Float-to-integer conversion differs by target: vcvta on AArch64 non-bare-metal
// builds, vcvt otherwise. NOTE(review): presumably vcvta rounds to nearest and
// vcvt truncates, mirroring the scalar tail below - confirm against the wrapper.
515 #if defined(__aarch64__) && !defined(BARE_METAL)
516 const auto out_0_int = wrapper::vcvta<int32_t>(out_0);
517 const auto out_1_int = wrapper::vcvta<int32_t>(out_1);
518 const auto out_2_int = wrapper::vcvta<int32_t>(out_2);
519 const auto out_3_int = wrapper::vcvta<int32_t>(out_3);
520 #else // defined(__aarch64__) && !defined(BARE_METAL)
521 const auto out_0_int = wrapper::vcvt<int32_t>(out_0);
522 const auto out_1_int = wrapper::vcvt<int32_t>(out_1);
523 const auto out_2_int = wrapper::vcvt<int32_t>(out_2);
524 const auto out_3_int = wrapper::vcvt<int32_t>(out_3);
525 #endif // defined(__aarch64__) && !defined(BARE_METAL)
526 const auto low_part =
528 const auto high_part =
// Scalar tail: remaining dimension-0 elements, one at a time.
535 for (; cout < out_dim_ch; ++cout)
537 const int8_t in00 = *(in_ptr_xi0_yi0 + cout *
sizeof(int8_t));
538 const int8_t in01 = *(in_ptr_xi1_yi0 + cout *
sizeof(int8_t));
539 const int8_t in10 = *(in_ptr_xi0_yi1 + cout *
sizeof(int8_t));
540 const int8_t in11 = *(in_ptr_xi1_yi1 + cout *
sizeof(int8_t));
// Weighted sum of the four neighbours.
542 float out0 = in00 * s00_s;
543 out0 += in01 * s01_s;
544 out0 += in10 * s10_s;
545 out0 += in11 * s11_s;
// std::round is applied on AArch64 non-bare-metal builds; otherwise the
// float is converted with a plain cast (which discards the fraction).
548 #if defined(__aarch64__) && !defined(BARE_METAL)
549 *(out_ptr_xo_yo + cout *
sizeof(int8_t)) =
static_cast<int8_t
>(
std::round(out0));
550 #else // defined(__aarch64__) && !defined(BARE_METAL)
551 *(out_ptr_xo_yo + cout *
sizeof(int8_t)) =
static_cast<int8_t
>(out0);
552 #endif // defined(__aarch64__) && !defined(BARE_METAL)
// Scale kernel for 16-bit signed data that copies one source element per
// output element (nearest-neighbour, per the function name).
// NOTE(review): this listing is fragmentary - part of the parameter list,
// the definitions of `hr`, `win` and `dst`, and the body of the vectorised
// loop are not visible here.
564 void s16_neon_scale_nearest(
const ITensor *
src,
566 const ITensor *offsets,
567 float sampling_offset,
569 const Window &window)
// Extent of dimension 0 including its left/right padding, in elements.
571 const size_t in_stride_c =
src->info()->dimension(0) +
src->info()->padding().left +
src->info()->padding().right;
// Extent of dimension 1 including its top/bottom padding, in elements.
572 const size_t in_stride_w =
src->info()->dimension(1) +
src->info()->padding().top +
src->info()->padding().bottom;
// Number of elements between two consecutive positions along dimension 2.
573 const size_t in_stride_wc = in_stride_w * in_stride_c;
574 const size_t in_dim_h =
src->info()->dimension(2);
// The x range of the window is walked manually by the two loops below.
578 const auto window_start_x =
static_cast<int32_t
>(window.x().start());
579 const auto window_end_x =
static_cast<int32_t
>(window.x().end());
// Number of elements consumed per iteration of the main x loop.
580 const int window_step_x = 8;
584 Iterator out(
dst, win);
// Address of the first source element (buffer base plus first-element offset).
586 const uint8_t *in_ptr_start =
src->buffer() +
src->info()->offset_first_element_in_bytes();
// Byte stride of dimension 3 of the source.
587 const unsigned int in_stride_bytes_hwc =
src->info()->strides_in_bytes()[3];
591 [&](
const Coordinates &
id)
// Value read from `offsets` at (id.y(), id.z()), scaled by the dimension-0 stride.
594 *
reinterpret_cast<const int32_t *
>(offsets->ptr_to_element(Coordinates(
id.y(),
id.z()))) * in_stride_c;
// Source index along dimension 2. Only the second alternative of the
// conditional is visible: floor((id.z() + sampling_offset) * hr).
595 const auto in_hi =
static_cast<int>(
597 : std::floor((id.z() + sampling_offset) * hr));
598 const int offset_row = in_hi * in_stride_wc;
599 int32_t x = window_start_x;
// The byte offset for index id[3] is applied before the cast to int16_t,
// so the element offsets added below are in int16_t units.
600 const int16_t *in_ptr =
reinterpret_cast<const int16_t *
>(in_ptr_start + in_stride_bytes_hwc *
id[3]);
// Main loop in steps of window_step_x (body not visible in this listing).
602 for (; x <= window_end_x - window_step_x; x += window_step_x)
// Tail loop: copies the remaining elements one at a time.
607 for (; x < window_end_x; ++x)
609 *(
reinterpret_cast<int16_t *
>(out.ptr()) + x) = *(in_ptr +
offset + offset_row + x);
// Bilinear scale kernel for 16-bit signed data (per the function name).
// Two per-element lambdas are visible: the first reads the four neighbours
// with bounds checks and substitutes a constant border value; the second
// clamps the neighbour coordinates into the source extent before reading.
// NOTE(review): this listing is fragmentary - the condition selecting between
// the two lambdas, part of the parameter list, the definition of `hr`, and
// the right-hand sides of the output stores are not visible here.
615 void s16_neon_scale_bilinear(
const ITensor *
src,
617 const ITensor *offsets,
621 PixelValue constant_border_value,
622 float sampling_offset,
624 const Window &window)
630 Iterator out(
dst, window);
// Extent of dimension 0 including its left/right padding, in elements.
631 const int in_stride_c =
src->info()->dimension(0) +
src->info()->padding().left +
src->info()->padding().right;
632 const int in_dim_w =
src->info()->dimension(1);
633 const int in_dim_h =
src->info()->dimension(2);
// Elements between two consecutive positions along dimension 2
// (dimension-1 extent plus top/bottom padding, times in_stride_c).
634 const int in_stride_wc = in_stride_c * (in_dim_w +
src->info()->padding().top +
src->info()->padding().bottom);
638 Window win_in(window);
641 Iterator in(
src, win_in);
// Value substituted for neighbours that fall outside the source bounds.
645 const int16_t const_border_value =
static_cast<int16_t
>(constant_border_value.get<int16_t>());
// First lambda: bounds-checked reads with constant border substitution.
648 [&](
const Coordinates &
id)
// Per-element values read from `offsets`, `dx` and `dy` at (id.y(), id.z()).
651 *
reinterpret_cast<const int32_t *
>(offsets->ptr_to_element(Coordinates(
id.y(),
id.z())));
652 const auto dx_val = *
reinterpret_cast<const float *
>(dx->ptr_to_element(Coordinates(
id.y(),
id.z())));
653 const auto dy_val = *
reinterpret_cast<const float *
>(dy->ptr_to_element(Coordinates(
id.y(),
id.z())));
// Source index along dimension 2 of the upper neighbour row.
654 const int32_t in_hi = std::floor((
id.z() + sampling_offset) * hr - sampling_offset);
655 const int16_t *in_ptr =
656 reinterpret_cast<const int16_t *
>(in.ptr()) +
offset * in_stride_c + in_hi * in_stride_wc;
// Each neighbour is read only when its coordinates lie inside the source
// extent; otherwise const_border_value is used. The "-1 <=" and
// "< extent - 1" bounds apply to the neighbours taken at +1 along that axis.
// (The declaration that the next expression initialises is not visible.)
659 (0 <=
offset &&
offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
660 const auto a01 = (-1 <=
offset &&
offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h)
661 ? *(in_ptr + in_stride_c)
662 : const_border_value;
663 const auto a10 = (0 <=
offset &&
offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1)
664 ? *(in_ptr + in_stride_wc)
665 : const_border_value;
666 const auto a11 = (-1 <=
offset &&
offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1)
667 ? *(in_ptr + in_stride_c + in_stride_wc)
668 : const_border_value;
// Store to the current output element (right-hand side not visible here).
670 *
reinterpret_cast<int16_t *
>(out.ptr()) =
// Second lambda: neighbour coordinates are clamped instead of bounds-checked.
679 [&](
const Coordinates &
id)
// Per-element values read from `offsets`, `dx` and `dy` at (id.y(), id.z()).
682 *
reinterpret_cast<const int32_t *
>(offsets->ptr_to_element(Coordinates(
id.y(),
id.z())));
683 const auto dx_val = *
reinterpret_cast<const float *
>(dx->ptr_to_element(Coordinates(
id.y(),
id.z())));
684 const auto dy_val = *
reinterpret_cast<const float *
>(dy->ptr_to_element(Coordinates(
id.y(),
id.z())));
685 const int in_hi = std::floor((
id.z() + sampling_offset) * hr - sampling_offset);
// Clamp both neighbour indices on each axis into the valid source range,
// so out-of-range neighbours read the nearest edge element.
687 const auto clamped_w = utility::clamp<int>(
offset, 0, in_dim_w - 1);
688 const auto clamped_w1 = utility::clamp<int>(
offset + 1, 0, in_dim_w - 1);
689 const auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
690 const auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
// Reads of the four neighbours at the clamped coordinates. (The declaration
// that the first expression initialises is not visible.)
693 *(
reinterpret_cast<const int16_t *
>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
694 const auto a01 = *(
reinterpret_cast<const int16_t *
>(in.ptr()) + clamped_w1 * in_stride_c +
695 clamped_h * in_stride_wc);
696 const auto a10 = *(
reinterpret_cast<const int16_t *
>(in.ptr()) + clamped_w * in_stride_c +
697 clamped_h1 * in_stride_wc);
698 const auto a11 = *(
reinterpret_cast<const int16_t *
>(in.ptr()) + clamped_w1 * in_stride_c +
699 clamped_h1 * in_stride_wc);
// Store to the current output element (right-hand side not visible here).
701 *
reinterpret_cast<int16_t *
>(out.ptr()) =
// Fragments of three dispatch functions that forward their arguments to the
// type-specific kernels above.
// NOTE(review): the enclosing function names, the rest of their parameter
// lists and the conditions choosing between bilinear and nearest are not
// visible in this listing.
// First fragment: forwards to the signed 8-bit bilinear kernel.
722 float sampling_offset,
728 s8_neon_scale_bilinear(
src,
dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
729 align_corners, window);
// Second fragment: forwards to the unsigned 8-bit bilinear or nearest kernel.
745 float sampling_offset,
751 u8_neon_scale_bilinear(
src,
dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
752 align_corners, window);
756 u8_neon_scale_nearest(
src,
dst, offsets, sampling_offset, align_corners, window);
// Third fragment: forwards to the signed 16-bit bilinear or nearest kernel.
768 float sampling_offset,
774 s16_neon_scale_bilinear(
src,
dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
775 align_corners, window);
779 s16_neon_scale_nearest(
src,
dst, offsets, sampling_offset, align_corners, window);