24 #ifndef ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H
25 #define ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H
// Declares a scale kernel entry point named `func_name` returning void.
// Every kernel declared through this macro gets the same parameter list:
// source and destination tensors, the `offsets`, `dx` and `dy` tensors, the
// interpolation policy, the border mode and its constant border value, the
// sampling offset, the align-corners flag and the execution window.
// The macro is removed again with #undef further down in this header.
// Keep comments out of the continuation lines below: the trailing backslashes
// splice them into one logical line.
38 #define DECLARE_SCALE_KERNEL(func_name) \
39 void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
40 InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, \
41 float sampling_offset, bool align_corners, const Window &window)
52 #undef DECLARE_SCALE_KERNEL
54 #ifdef ENABLE_NCHW_KERNELS
// Nearest-neighbour scale kernel for NCHW data (only built when
// ENABLE_NCHW_KERNELS is defined). For every output position it copies a single
// source element whose address is the current `offsets` entry plus a row offset
// computed from the output y coordinate.
56 void scale_nearest_nchw(
const ITensor *
src,
60 const ITensor *offsets,
61 PixelValue constant_border_value,
62 float sampling_offset,
// Source row pitch expressed in elements: width plus left and right padding.
68 const size_t in_stride_x =
src->info()->dimension(0) +
src->info()->padding().left +
src->info()->padding().right;
// Input window starts as a copy of the execution window.
76 Window win_in(window);
// Every dimension of the offsets window from Z upwards is replaced by
// Dimension(0, 0, 0) - presumably (start, end, step), so that the offsets
// iterator stays fixed along those dimensions; TODO confirm against Window.
// `win_off` itself is declared outside the visible lines.
84 for (
size_t d =
Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
86 win_off.set(d, Window::Dimension(0, 0, 0));
90 Iterator src_i(
src, win_in);
91 Iterator dst_i(
dst, window);
92 Iterator offsets_i(offsets, win_off);
// Per-element body; `id` carries the coordinates of the current output element.
95 [&](
const Coordinates &
id)
// Offsets are stored as 32-bit signed integers.
97 const auto offsets_ptr =
reinterpret_cast<const int32_t *
>(offsets_i.ptr());
// Source row index. Only the second branch of the conditional is visible: it
// floors (y + sampling_offset) * hr. `hr` is defined outside this excerpt -
// presumably the height scale ratio; TODO confirm.
98 const auto in_yi =
static_cast<int32_t
>(
100 : std::floor((id.y() + sampling_offset) * hr));
// Convert the row index into an element offset using the padded row pitch.
101 const int32_t offset_row = in_yi * in_stride_x;
// Copy one element of type T: source address is offset, in elements, by the
// current offsets entry plus the row offset.
102 *
reinterpret_cast<T *
>(dst_i.ptr()) =
103 *(
reinterpret_cast<const T *
>(src_i.ptr()) + offsets_ptr[0] + offset_row);
// Iterators advanced together with the loop body above.
105 src_i, offsets_i, dst_i);
// Templated NCHW scale kernel (inside the ENABLE_NCHW_KERNELS section) that
// gathers the 2x2 neighbourhood a00/a01/a10/a11 around each sample position,
// together with per-element `dx` and `dy` values.
// NOTE(review): the function name and the expression that combines the four
// taps are not visible in this excerpt - presumably bilinear interpolation.
// Two per-element bodies are visible: the first substitutes a constant border
// value for taps outside the source, the second clamps tap coordinates into the
// source. The condition selecting between them is outside the visible lines -
// presumably the border mode; TODO confirm.
108 template <
typename T>
113 const ITensor *offsets,
115 PixelValue constant_border_value,
116 float sampling_offset,
118 const Window &window)
// Input window starts as a copy of the execution window.
129 Window win_in(window);
// Dimensions from Z upwards of the offsets window are set to Dimension(0, 0, 0)
// - presumably so the offsets/dx/dy iterators do not advance along them;
// TODO confirm. `win_off` is declared outside the visible lines.
133 for (
size_t d =
Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
135 win_off.set(d, Window::Dimension(0, 0, 0));
138 Iterator src_i(
src, win_in);
139 Iterator dst_i(
dst, window);
// offsets, dx and dy all iterate over the same window.
140 Iterator offsets_i(offsets, win_off);
141 Iterator dx_i(dx, win_off);
142 Iterator dy_i(dy, win_off);
// Source width and height (dimensions 0 and 1).
144 const int32_t in_dim_w =
src->info()->dimension(0);
145 const int32_t in_dim_h =
src->info()->dimension(1);
// Source row pitch in elements: width plus left and right padding.
146 const int32_t in_stride_w = in_dim_w +
src->info()->padding().left +
src->info()->padding().right;
150 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// With FP16 vector arithmetic available, a float16_t kernel reads the border
// value from PixelValue as `half`; every other T is read as T itself.
151 using ConstType =
typename std::conditional<std::is_same<T, float16_t>::value,
half, T>
::type;
// Border value converted once to the element type.
155 const T const_border_value =
static_cast<T
>(constant_border_value.get<ConstType>());
// First per-element body: taps outside the source read const_border_value.
158 [&](
const Coordinates &
id)
// Top row of the 2x2 neighbourhood. `hr` is defined outside this excerpt.
160 const int32_t index_h = std::floor((
id.y() + sampling_offset) * hr - sampling_offset);
// Left column of the neighbourhood, read from the offsets tensor as int32.
161 const auto index_w = *(
reinterpret_cast<const int32_t *
>(offsets_i.ptr()));
// dx / dy are read as float.
162 const auto dx_val = *(
reinterpret_cast<const float *
>(dx_i.ptr()));
163 const auto dy_val = *(
reinterpret_cast<const float *
>(dy_i.ptr()));
164 const auto pixel_row_ptr =
reinterpret_cast<const T *
>(src_i.ptr());
// a00 = (index_w, index_h): read only if both coordinates are inside the source.
166 const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h)
167 ? (*(pixel_row_ptr + index_w + index_h * in_stride_w))
168 : const_border_value;
// a01 = (index_w + 1, index_h): the column test is shifted by one accordingly.
169 const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h)
170 ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w))
171 : const_border_value;
// a10 = (index_w, index_h + 1): the row test is shifted by one accordingly.
172 const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1)
173 ? (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w))
174 : const_border_value;
// a11 = (index_w + 1, index_h + 1): both tests are shifted by one.
175 const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1)
176 ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w))
177 : const_border_value;
// Store of the result as T; the right-hand side is not visible in this excerpt.
179 *
reinterpret_cast<T *
>(dst_i.ptr()) =
182 src_i, offsets_i, dx_i, dy_i, dst_i);
// Second per-element body: tap coordinates are clamped into the source instead
// of being replaced by a border value.
188 [&](
const Coordinates &
id)
190 const int index_h = std::floor((
id.y() + sampling_offset) * hr - sampling_offset);
191 const auto index_w = *(
reinterpret_cast<const int32_t *
>(offsets_i.ptr()));
192 const auto dx_val = *(
reinterpret_cast<const float *
>(dx_i.ptr()));
193 const auto dy_val = *(
reinterpret_cast<const float *
>(dy_i.ptr()));
194 const auto pixel_row_ptr =
reinterpret_cast<const T *
>(src_i.ptr());
// Clamp both columns and both rows to [0, dim - 1].
196 auto clamped_x = utility::clamp<int>(index_w, 0, in_dim_w - 1);
197 auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
198 auto clamped_y = utility::clamp<int>(index_h, 0, in_dim_h - 1);
199 auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
// With clamped coordinates all four taps can be read unconditionally.
201 const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
202 const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
203 const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
204 const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
// Store of the result as T; the right-hand side is not visible in this excerpt.
206 *
reinterpret_cast<T *
>(dst_i.ptr()) =
209 src_i, offsets_i, dx_i, dy_i, dst_i);
216 #endif // ENABLE_NCHW_KERNELS
// Templated scale kernel that copies, for every output (x, y, batch) position,
// all channels of one source position chosen by flooring the scaled output
// coordinate. Channels are tensor dimension 0; strides 1, 2 and 3 are used for
// x, y and the outermost loop index `bo` respectively.
// NOTE(review): the function name is not visible here; the call
// nearest_neon_scale<T>(...) later in this header presumably targets it.
218 template <
typename T>
222 float sampling_offset,
// All strides are in bytes, so pointer arithmetic below is done on uint8_t*.
234 const int in_stride_y =
src->info()->strides_in_bytes()[1];
235 const int in_stride_z =
src->info()->strides_in_bytes()[2];
236 const int in_stride_w =
src->info()->strides_in_bytes()[3];
237 const int out_stride_y =
dst->info()->strides_in_bytes()[1];
238 const int out_stride_z =
dst->info()->strides_in_bytes()[2];
239 const int out_stride_w =
dst->info()->strides_in_bytes()[3];
// Number of channels to copy per output position.
240 const int out_dim_ch =
dst->info()->dimension(0);
// Number of T elements that fit in 16 bytes; this is the vector loop step.
241 const int step_cout = 16 /
sizeof(T);
243 Window window_execution = window;
245 Window win_in_out(window);
// Output x iterates over the window's Y dimension.
251 const int xo_start = window_execution.
y().
start();
252 const int xo_end = window_execution.
y().
end();
253 const int xo_step = window_execution.
y().
step();
// Output y iterates over the window's Z dimension.
254 const int yo_start = window_execution.
z().
start();
255 const int yo_end = window_execution.
z().
end();
256 const int yo_step = window_execution.
z().
step();
// Outermost index iterates over window dimension 3 (presumably batch).
257 const int bo_start = window_execution[3].start();
258 const int bo_end = window_execution[3].end();
259 const int bo_step = window_execution[3].step();
261 for (
int bo = bo_start; bo < bo_end; bo += bo_step)
// Base pointers for this `bo` slice; `in` and `out` are declared outside the
// visible lines.
263 const uint8_t *in_ptr_base = in.
ptr() + bo * in_stride_w;
264 uint8_t *out_ptr_base = out.
ptr() + bo * out_stride_w;
266 for (
int yo = yo_start; yo < yo_end; yo += yo_step)
// Source y in floating point. `scale_y` is defined outside this excerpt.
269 float yi_f = ((yo + sampling_offset) *
scale_y);
// Visible assignment floors the coordinate; any alternative rounding path
// (e.g. for align_corners) is not visible here.
277 yi =
static_cast<int>(std::floor(yi_f));
280 for (
int xo = xo_start; xo < xo_end; xo += xo_step)
// Source x, computed the same way as source y above.
283 float xi_f = ((xo + sampling_offset) *
scale_x);
291 xi =
static_cast<int>(std::floor(xi_f));
// Byte addresses of the first channel at the source and destination positions.
294 const uint8_t *in_ptr = in_ptr_base + xi * in_stride_y + yi * in_stride_z;
295 uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
// Main loop: copy step_cout channels per iteration while a full group remains.
// `cout` is a channel index declared outside the visible lines (not std::cout).
298 for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
// cout * sizeof(T) converts the channel index into a byte offset.
300 auto out0 =
wrapper::vloadq(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T)));
301 wrapper::vstore(
reinterpret_cast<T *
>(out_ptr + cout *
sizeof(T)), out0);
// Tail loop: copy the remaining channels one element at a time.
304 for (; cout < out_dim_ch; ++cout)
306 auto out0 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T)));
307 *(
reinterpret_cast<T *
>(out_ptr + cout *
sizeof(T))) = out0;
// Templated scale kernel whose per-channel result is the weighted sum of four
// neighbouring source samples (in00, in01, in10, in11), with weights built from
// the fractional part of the sample position, i.e. bilinear interpolation.
// Channels are tensor dimension 0, x is dimension 1 and y is dimension 2;
// stride 3 is used for the outermost loop index `bo`.
// Two outer loops are visible: the first substitutes a constant border value
// for samples outside the source, the second clamps sample coordinates into the
// source. NOTE(review): the condition choosing between them and the function
// name are outside the visible lines - presumably the border mode selects the
// loop, and the call bilinear_neon_scale<T>(...) later in this header targets
// this function; confirm.
314 template <
typename T>
322 float sampling_offset,
// All strides are in bytes, so pointer arithmetic below is done on uint8_t*.
337 const int in_stride_y =
src->info()->strides_in_bytes()[1];
338 const int in_stride_z =
src->info()->strides_in_bytes()[2];
339 const int in_stride_w =
src->info()->strides_in_bytes()[3];
340 const int out_stride_y =
dst->info()->strides_in_bytes()[1];
341 const int out_stride_z =
dst->info()->strides_in_bytes()[2];
342 const int out_stride_w =
dst->info()->strides_in_bytes()[3];
// Source width and height, used for the bounds checks and clamps below.
343 const int in_dim_w =
src->info()->dimension(1);
344 const int in_dim_h =
src->info()->dimension(2);
// Number of channels to produce per output position.
345 const int out_dim_ch =
dst->info()->dimension(0);
// Number of T elements that fit in 16 bytes; this is the vector loop step.
346 const int step_cout = 16 /
sizeof(T);
348 Window window_execution = window;
350 Window win_in_out(window);
// Output x iterates over the window's Y dimension.
356 const int xo_start = window_execution.
y().
start();
357 const int xo_end = window_execution.
y().
end();
358 const int xo_step = window_execution.
y().
step();
// Output y iterates over the window's Z dimension.
359 const int yo_start = window_execution.
z().
start();
360 const int yo_end = window_execution.
z().
end();
361 const int yo_step = window_execution.
z().
step();
// Outermost index iterates over window dimension 3 (presumably batch).
362 const int bo_start = window_execution[3].start();
363 const int bo_end = window_execution[3].end();
364 const int bo_step = window_execution[3].step();
368 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// With FP16 vector arithmetic available, a float16_t kernel reads the border
// value from PixelValue as `half`; every other T is read as T itself.
369 using ConstType =
typename std::conditional<std::is_same<T, float16_t>::value,
half, T>
::type;
// Border value converted once to the element type.
373 const T const_border_value =
static_cast<T
>(constant_border_value.
get<ConstType>());
// ---- First loop nest: out-of-range samples read const_border_value. ----
375 for (
int bo = bo_start; bo < bo_end; bo += bo_step)
// Base pointers for this `bo` slice; `in` and `out` are declared outside the
// visible lines.
377 const uint8_t *in_ptr_base = in.
ptr() + bo * in_stride_w;
378 uint8_t *out_ptr_base = out.
ptr() + bo * out_stride_w;
380 for (
int yo = yo_start; yo < yo_end; yo += yo_step)
// Source y in floating point. `scale_y` is defined outside this excerpt.
383 const float yi_f = ((yo + sampling_offset) *
scale_y - sampling_offset);
// Top row of the 2x2 neighbourhood.
385 const auto yi =
static_cast<int>(std::floor(yi_f));
// a1 is the fractional distance from row yi; b1 is its complement.
387 const auto a1 = (yi_f -
static_cast<float>(yi));
388 const auto b1 = (1.f - a1);
390 for (
int xo = xo_start; xo < xo_end; xo += xo_step)
// Source x in floating point. `scale_x` is defined outside this excerpt.
393 const float xi_f = ((xo + sampling_offset) *
scale_x - sampling_offset);
// Left column of the 2x2 neighbourhood.
395 const auto xi =
static_cast<int>(std::floor(xi_f));
// a is the fractional distance from column xi; b is its complement.
397 const auto a = (xi_f -
static_cast<float>(xi));
398 const auto b = (1.f - a);
// Weights of the four samples, converted to T:
// s00 -> (xi, yi), s01 -> (xi + 1, yi), s10 -> (xi, yi + 1), s11 -> (xi + 1, yi + 1).
400 const auto s00_s =
static_cast<T
>(
b * b1);
401 const auto s01_s =
static_cast<T
>(a * b1);
402 const auto s10_s =
static_cast<T
>(
b * a1);
403 const auto s11_s =
static_cast<T
>(a * a1);
// Byte addresses of the first channel at (xi, yi) and at the output position.
// in_ptr may lie outside the source here; it is only dereferenced behind the
// bounds checks below.
405 const uint8_t *in_ptr = in_ptr_base + xi * in_stride_y + yi * in_stride_z;
406 uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
// Main loop: step_cout channels per iteration while a full group remains.
// `cout` is a channel index declared outside the visible lines (not std::cout).
409 for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
// Start every sample as a vector filled with the border value; samples that
// are in range are overwritten by loads below.
411 auto in00 =
wrapper::vdup_n(
static_cast<T
>(const_border_value), ExactTagType{});
412 auto in01 =
wrapper::vdup_n(
static_cast<T
>(const_border_value), ExactTagType{});
413 auto in10 =
wrapper::vdup_n(
static_cast<T
>(const_border_value), ExactTagType{});
414 auto in11 =
wrapper::vdup_n(
static_cast<T
>(const_border_value), ExactTagType{});
// Row yi is inside the source.
415 if ((yi >= 0) && (yi < in_dim_h))
// Column xi is inside the source: load sample (xi, yi).
417 if ((xi >= 0) && (xi < in_dim_w))
419 in00 =
wrapper::vloadq(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T)));
// Column xi + 1 is inside the source: address is one x stride further.
421 if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
424 reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + in_stride_y));
// Row yi + 1 is inside the source.
427 if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
// Sample (xi, yi + 1): one y stride further.
429 if ((xi >= 0) && (xi < in_dim_w))
432 reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + in_stride_z));
// Sample (xi + 1, yi + 1): one x stride and one y stride further.
434 if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
437 reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + in_stride_y + in_stride_z));
// Store the vector result; its computation is not visible in this excerpt.
450 wrapper::vstore(
reinterpret_cast<T *
>(out_ptr + cout *
sizeof(T)), out0);
// Tail loop: remaining channels one element at a time, same logic as above.
453 for (; cout < out_dim_ch; ++cout)
// Default every sample to the border value.
455 auto in00 =
static_cast<T
>(const_border_value);
456 auto in01 =
static_cast<T
>(const_border_value);
457 auto in10 =
static_cast<T
>(const_border_value);
458 auto in11 =
static_cast<T
>(const_border_value);
// Overwrite the samples whose row and column are inside the source.
459 if ((yi >= 0) && (yi < in_dim_h))
461 if ((xi >= 0) && (xi < in_dim_w))
463 in00 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T)));
465 if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
467 in01 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + in_stride_y));
470 if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
472 if ((xi >= 0) && (xi < in_dim_w))
474 in10 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + in_stride_z));
476 if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
479 reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + in_stride_y + in_stride_z));
// Weighted sum of the four samples, accumulated in T.
482 auto out0 =
static_cast<T
>(0);
483 out0 += in00 * s00_s;
484 out0 += in01 * s01_s;
485 out0 += in10 * s10_s;
486 out0 += in11 * s11_s;
487 *(
reinterpret_cast<T *
>(out_ptr + cout *
sizeof(T))) = out0;
// ---- Second loop nest: sample coordinates are clamped into the source. ----
495 for (
int bo = bo_start; bo < bo_end; bo += bo_step)
// Here in_ptr / out_ptr are the per-`bo` base pointers; x and y offsets are
// added at each access instead of being folded into the pointer.
497 const uint8_t *in_ptr = in.
ptr() + bo * in_stride_w;
498 uint8_t *out_ptr = out.
ptr() + bo * out_stride_w;
500 for (
int yo = yo_start; yo < yo_end; yo += yo_step)
// Source y, top row and vertical weights, computed as in the first loop nest.
503 const float yi_f = ((yo + sampling_offset) *
scale_y - sampling_offset);
505 const auto yi =
static_cast<int>(std::floor(yi_f));
507 const auto a1 = (yi_f -
static_cast<float>(yi));
508 const auto b1 = (1.f - a1);
// Clamp both rows to [0, in_dim_h - 1]. The weights above still use the
// unclamped yi.
510 const int yi0 = utility::clamp<int>(yi, 0, in_dim_h - 1);
511 const int yi1 = utility::clamp<int>(yi + 1, 0, in_dim_h - 1);
// Byte offsets of the two source rows, computed once per output row.
513 const int yi0_offset = yi0 * in_stride_z;
514 const int yi1_offset = yi1 * in_stride_z;
// Byte offset of the output row.
516 const int y_offset = yo * out_stride_z;
517 for (
int xo = xo_start; xo < xo_end; xo += xo_step)
// Source x, left column and horizontal weights, as in the first loop nest.
520 const float xi_f = ((xo + sampling_offset) *
scale_x - sampling_offset);
522 const auto xi =
static_cast<int>(std::floor(xi_f));
524 const auto a = (xi_f -
static_cast<float>(xi));
525 const auto b = (1.f - a);
// Weights of the four samples, converted to T (same layout as above).
527 const auto s00_s =
static_cast<T
>(
b * b1);
528 const auto s01_s =
static_cast<T
>(a * b1);
529 const auto s10_s =
static_cast<T
>(
b * a1);
530 const auto s11_s =
static_cast<T
>(a * a1);
// Clamp both columns to [0, in_dim_w - 1].
537 const int xi0 = utility::clamp<int>(xi, 0, in_dim_w - 1);
538 const int xi1 = utility::clamp<int>(xi + 1, 0, in_dim_w - 1);
// Byte offsets of the two source columns.
540 const int xi0_offset = xi0 * in_stride_y;
541 const int xi1_offset = xi1 * in_stride_y;
// Byte offset of the output position within this `bo` slice.
543 const int offset = xo * out_stride_y + y_offset;
// Main loop: step_cout channels per iteration. Only the four sample addresses
// are visible: (x0, y0), (x1, y0), (x0, y1), (x1, y1) in that order.
546 for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
549 reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + xi0_offset + yi0_offset));
551 reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + xi1_offset + yi0_offset));
553 reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + xi0_offset + yi1_offset));
555 reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + xi1_offset + yi1_offset));
// Tail loop: remaining channels one element at a time; the four reads use the
// same address order as the main loop.
564 for (; cout < out_dim_ch; ++cout)
567 *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + xi0_offset + yi0_offset));
569 *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + xi1_offset + yi0_offset));
571 *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + xi0_offset + yi1_offset));
573 *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + xi1_offset + yi1_offset));
// Weighted sum of the four samples, accumulated in T.
575 T out0 = in00 * s00_s;
576 out0 += in01 * s01_s;
577 out0 += in10 * s10_s;
578 out0 += in11 * s11_s;
579 *(
reinterpret_cast<T *
>(out_ptr +
offset + cout *
sizeof(T))) = out0;
// Templated entry point that forwards to one of the two kernels called below.
// NOTE(review): the function name and the condition selecting the call are not
// visible in this excerpt - presumably the interpolation policy decides between
// the bilinear and the nearest kernel; confirm.
591 template <
typename T>
600 float sampling_offset,
// Bilinear path: needs dx/dy, the border mode and the constant border value.
606 bilinear_neon_scale<T>(
src,
dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
607 align_corners, window);
// Nearest path: takes only the tensors, offsets, sampling offset,
// align-corners flag and window.
611 nearest_neon_scale<T>(
src,
dst, offsets, sampling_offset, align_corners, window);
617 #endif // ACL_SRC_CPU_KERNELS_SCALE_NEON_LIST_H