// Combines two int16x8_t half-results and stores them through `output`.
// NOTE(review): fragment — interior lines of this definition are missing from
// this extraction. The uint8_t branch presumably narrows/saturates before the
// store — confirm against the full source.
50 void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output,
int offset = 0)
52 if (std::is_same<T, uint8_t>::value)
// Builds per-lane ARG_MIN/ARG_MAX index vectors: compares `a` against `b`
// (comparison direction depends on `op`) and selects, per lane, either the
// freshly built lane indices (idx..idx+3) or the previous indices in `c`.
// NOTE(review): fragment — mask construction lines are missing from this view.
65 uint32x4x4_t calculate_index(uint32_t idx, T a, T
b, uint32x4x4_t c,
ReductionOperation op,
int axis)
// Candidate lane indices for the current vector of 4 elements.
77 uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3};
// vbsl: take vec_idx where mask is set, keep old index c.val[0] otherwise.
82 uint32x4x4_t res = {{
wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0}};
// Quantized (16-lanes-per-vector) variant of calculate_index: tracks 16
// candidate indices in four uint32x4_t vectors and bit-selects between the
// new indices and the previous ones (`c`) using the comparison mask.
// NOTE(review): fragment — mask widening from mask_u8 to the four uint32
// masks is missing from this extraction; confirm against the full source.
88 uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T
b, uint32x4x4_t c,
ReductionOperation op,
int axis)
90 uint32x4x4_t mask{{0}};
91 uint8x16_t mask_u8{0};
// Sequential lane indices idx..idx+15 — used when reducing along the X axis.
113 uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3},
114 {idx + 4, idx + 5, idx + 6, idx + 7},
115 {idx + 8, idx + 9, idx + 10, idx + 11},
116 {idx + 12, idx + 13, idx + 14, idx + 15}}};
// Non-X axis: every lane belongs to the same reduced position, so all four
// index vectors are broadcast to the same `idx`.
119 vec_idx.val[0] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
120 vec_idx.val[1] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
121 vec_idx.val[2] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
122 vec_idx.val[3] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
// Per-lane select: new index where the comparison mask is set, old otherwise.
125 {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
126 vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}};
// Horizontal-min helper enabled for 128-bit float/int32 vectors; returns the
// matching 64-bit (2-lane) vector type via std::conditional.
// NOTE(review): fragment — function name/body lines are missing from this
// extraction; presumed to be calculate_min based on surrounding callers.
132 template <
typename T>
133 inline typename std::enable_if<
134 std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
135 typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>
::type>
::type
// Horizontal-min helper enabled for 128-bit u8/s8 vectors; returns the
// matching 64-bit (8-lane) vector type.
// NOTE(review): fragment — name/body missing from this extraction.
143 template <
typename T>
144 inline typename std::enable_if<
145 std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
146 typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>
::type>
::type
// Horizontal-max helper enabled for 128-bit float/int32 vectors; returns the
// matching 64-bit (2-lane) vector type.
// NOTE(review): fragment — name/body missing from this extraction.
156 template <
typename T>
157 inline typename std::enable_if<
158 std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
159 typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>
::type>
::type
// Horizontal-max helper enabled for 128-bit u8/s8 vectors; returns the
// matching 64-bit (8-lane) vector type.
// NOTE(review): fragment — name/body missing from this extraction.
167 template <
typename T>
168 inline typename std::enable_if<
169 std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
170 typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>
::type>
::type
// Extracts the scalar ARG_MIN/ARG_MAX index from the per-lane index vectors:
// finds the lane holding the horizontal min (ARG_IDX_MIN) or max
// (ARG_IDX_MAX) of vec_res_value, then reads the corresponding index.
// NOTE(review): fragment — the mask/extract steps between these lines are
// missing. Indices are apparently stored offset by +0xFFFFFFFF so that 0 can
// mean "not set"; the final subtraction undoes that bias — confirm.
179 template <
typename T>
180 uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value,
ReductionOperation op)
182 uint32x4_t res_idx_mask{0};
183 uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
// Horizontal reduction of the value vector (min or max, depending on op).
187 auto pmin = calculate_min(vec_res_value);
193 auto pmax = calculate_max(vec_res_value);
203 return (res - 0xFFFFFFFF);
// Quantized (16-lane) variant of calculate_vector_index: locates the lane
// holding the horizontal min/max of a u8/s8 vector and recovers its stored
// index from the four uint32x4_t index vectors.
// NOTE(review): fragment — the construction of wide_u32_1..4 (widened lane
// masks) is missing from this extraction.
206 template <
typename T>
207 uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value,
ReductionOperation op)
209 uint32x4x4_t res_idx_mask{{0}};
210 uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
211 uint8x16_t mask_u8{0};
// Horizontal reduction of the value vector (min or max, depending on op).
214 auto pmin = calculate_min(vec_res_value);
219 auto pmax = calculate_max(vec_res_value);
// Keep only the indices of matching lanes (AND with the widened masks)...
236 res_idx_mask.val[0] =
wrapper::vand(vec_res_idx.val[0], wide_u32_1);
237 res_idx_mask.val[1] =
wrapper::vand(vec_res_idx.val[1], wide_u32_2);
238 res_idx_mask.val[2] =
wrapper::vand(vec_res_idx.val[2], wide_u32_3);
239 res_idx_mask.val[3] =
wrapper::vand(vec_res_idx.val[3], wide_u32_4);
// ...then add 0xFFFFFFFF (== subtract 1) so non-matching lanes wrap to the
// sentinel and the smallest biased index can be selected below.
240 res_idx_mask.val[0] =
wrapper::vadd(res_idx_mask.val[0], mask_ones);
241 res_idx_mask.val[1] =
wrapper::vadd(res_idx_mask.val[1], mask_ones);
242 res_idx_mask.val[2] =
wrapper::vadd(res_idx_mask.val[2], mask_ones);
243 res_idx_mask.val[3] =
wrapper::vadd(res_idx_mask.val[3], mask_ones);
245 uint32_t res = 0xFFFFFFFF;
// Remove the +0xFFFFFFFF bias to recover the real element index.
255 return (res - 0xFFFFFFFF);
// FP16 overloads of the index/min/max helpers, compiled only when the target
// supports half-precision vector arithmetic (8 fp16 lanes per 128-bit vector,
// hence uint32x4x2_t index pairs instead of x4).
// NOTE(review): fragment — interior lines of each overload are missing.
258 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
261 calculate_index(uint32_t idx, float16x8_t a, float16x8_t
b, uint32x4x4_t c,
ReductionOperation op,
int axis)
263 uint32x4x2_t mask{0};
264 uint16x8_t mask_u16{0};
// X-axis: sequential lane indices idx..idx+7 across the two index vectors.
275 uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}};
// Non-X axis: all lanes map to the same reduced position `idx`.
278 vec_idx.val[0] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
279 vec_idx.val[1] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
281 uint32x4x4_t res = {
wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
// Horizontal min/max of an 8-lane fp16 vector, narrowed to 4 lanes.
288 inline float16x4_t calculate_min(float16x8_t in)
295 inline float16x4_t calculate_max(float16x8_t in)
// FP16 variant of calculate_vector_index (see the generic version above).
303 uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value,
ReductionOperation op)
305 uint32x4x2_t res_idx_mask{0};
306 uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
310 auto pmin = calculate_min(vec_res_value);
315 auto pmax = calculate_max(vec_res_value);
324 res_idx_mask.val[0] =
wrapper::vand(vec_res_idx.val[0], wide_u32_1);
325 res_idx_mask.val[1] =
wrapper::vand(vec_res_idx.val[1], wide_u32_2);
326 res_idx_mask.val[0] =
wrapper::vadd(res_idx_mask.val[0], mask_ones);
327 res_idx_mask.val[1] =
wrapper::vadd(res_idx_mask.val[1], mask_ones);
329 uint32_t res = 0xFFFFFFFF;
// Undo the +0xFFFFFFFF index bias.
339 return (res - 0xFFFFFFFF);
341 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Reducer dispatch helpers: each reduceN sets up input/output Windows so the
// reduced dimension collapses to a single step, then invokes functor `f`.
// NOTE(review): fragment — enclosing struct/template header is not visible.
347 static void reduceX(
const Window &window,
const ITensor *
input, ITensor *output, F f,
const ReductionOperation op)
// Output is collapsed to one element along X; input window is used as-is.
350 Window out_window(window);
351 out_window.set(
Window::DimX, Window::Dimension(0, 1, 1));
353 f(window, out_window,
input, output, op);
355 static void reduceY(
const Window &window,
const ITensor *
input, ITensor *output, F f,
const ReductionOperation op)
358 Window in_window(window);
359 Window out_window(window);
// Input iterates Y once; output steps over the whole Y extent at a time.
361 in_window.set(
Window::DimY, Window::Dimension(0, 1, 1));
362 out_window.set(
Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1)));
// The extra `1` argument is the reduction axis passed to the functor.
364 f(in_window, out_window,
input, output, 1, op);
366 static void reduceZ(
const Window &window,
const ITensor *
input, ITensor *output, F f,
const ReductionOperation op)
369 Window in_window(window);
370 Window out_window(window);
372 in_window.set(
Window::DimZ, Window::Dimension(0, 1, 1));
373 out_window.set(
Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2)));
375 f(in_window, out_window,
input, output, 2, op);
377 static void reduceW(
const Window &window,
const ITensor *
input, ITensor *output, F f,
const ReductionOperation op)
380 Window in_window(window);
381 Window out_window(window);
// Dimension index 3 (W) — no named Window::DimW constant is used here.
383 in_window.set(3, Window::Dimension(0, 1, 1));
384 out_window.set(3, Window::Dimension(0, 1, 1));
386 f(in_window, out_window,
input, output, 3, op);
// RedOpX: reduction along the X (innermost) axis for non-quantized types.
// Processes 16 bytes (16/sizeof(T) lanes) per vector iteration, then a scalar
// tail loop; supports SUM/MEAN/PROD/SUM_SQUARE/MIN/MAX/ARG_IDX_{MIN,MAX}.
// NOTE(review): fragment — many interior lines (vector load, op switch,
// horizontal reductions) are missing from this extraction.
390 template <
typename T,
int S>
394 using ExactTagType =
typename wrapper::traits::neon_vector<T, S>::tag_type;
396 inline void operator()(
397 const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
const ReductionOperation op)
399 const size_t input_dim_0 = in->info()->dimension(0);
400 const int window_step_x = 16 /
sizeof(T);
401 const auto window_start_x =
static_cast<int>(in_window.x().start());
402 const auto window_end_x =
static_cast<int>(in_window.x().end());
// Collapse X in the execution window: each invocation reduces a full row.
404 Window in_win_no_pad = in_window;
405 in_win_no_pad.set(
Window::DimX, Window::Dimension(0, 1, 1));
407 Iterator
input(in, in_win_no_pad);
408 Iterator output(out, out_window);
412 [&](
const Coordinates &)
414 const auto input_ptr =
reinterpret_cast<const T *
>(
input.ptr());
// Accumulator seed: 0 for additive ops...
416 auto init_res_value =
static_cast<T
>(0.f);
// ...first element for min/max-style ops...
424 init_res_value =
static_cast<T
>(*input_ptr);
// ...and 1 for PROD.
429 init_res_value =
static_cast<T
>(1.f);
436 uint32x4x4_t vec_res_idx{{0}};
// Main vectorized loop over the row.
439 int x = window_start_x;
440 for (; x <= (window_end_x - window_step_x); x += window_step_x)
// ARG_IDX_MIN: keep the element-wise min and update the index vectors.
457 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
458 vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
460 vec_res_value = temp_vec_res_value;
// ARG_IDX_MAX: same, with element-wise max.
465 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
466 vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
468 vec_res_value = temp_vec_res_value;
// Horizontal sum: scalar lane walk in debug builds, pairwise adds otherwise.
492 #ifdef ARM_COMPUTE_DEBUG_ENABLED
493 auto res =
static_cast<T
>(0.f);
494 for (
int i = 0; i < S; ++i)
498 #else // ARM_COMPUTE_DEBUG_ENABLED
501 for (
int i = 0; i < S / 4; ++i)
506 #endif // ARM_COMPUTE_DEBUG_ENABLED
// Scalar tails per operation follow.
510 for (; x < window_end_x; ++x)
512 res += (*(input_ptr + x)) * (*(input_ptr + x));
518 for (; x < window_end_x; ++x)
520 res += *(input_ptr + x);
529 *(
reinterpret_cast<T *
>(output.ptr())) = res;
// PROD: pairwise lane products then scalar tail.
537 for (
int i = 0; i < S / 2; ++i)
543 for (; x < window_end_x; ++x)
545 res *= *(input_ptr + x);
548 *(
reinterpret_cast<T *
>(output.ptr())) = res;
// ARG_IDX_MIN: resolve vector index, then let the scalar tail override it.
553 auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
557 for (; x < window_end_x; ++x)
559 if (*(input_ptr + x) < res)
562 res = *(input_ptr + x);
565 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
570 auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
574 for (; x < window_end_x; ++x)
576 if (*(input_ptr + x) > res)
579 res = *(input_ptr + x);
582 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
// MIN / MAX scalar tails and stores.
590 for (; x < window_end_x; ++x)
592 res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
594 *(
reinterpret_cast<T *
>(output.ptr())) = res;
602 for (; x < window_end_x; ++x)
604 res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
606 *(
reinterpret_cast<T *
>(output.ptr())) = res;
// RedOpX_quantized: X-axis reduction for quantized (u8/s8) tensors. Values
// are widened to 32-bit accumulators (4 vectors of 4 lanes per 16-element
// step); PROD works in dequantized float space; MEAN_SUM requantizes with the
// precomputed affine A/B; SUM compensates accumulated offsets.
// NOTE(review): fragment — the op switch, vector loads and several stores are
// missing interior lines in this extraction.
617 template <
typename T>
618 struct RedOpX_quantized
620 inline void operator()(
621 const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
const ReductionOperation op)
625 const auto oq_info = out->info()->quantization_info().uniform();
627 const TensorInfo in_info = *(in->info());
628 const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
630 const int window_step_x = 16 /
sizeof(T);
631 const auto window_start_x =
static_cast<int>(in_window.x().start());
632 const auto window_end_x =
static_cast<int>(in_window.x().end());
634 Window in_win_no_pad = in_window;
635 in_win_no_pad.set(
Window::DimX, Window::Dimension(0, 1, 1));
637 Iterator
input(in, in_win_no_pad);
638 Iterator output(out, out_win_no_pad? /* see note */ out_window : out_window);
640 const auto in_offset =
static_cast<float>(iq_info.offset);
641 const float in_scale = iq_info.scale;
643 const auto out_offset =
static_cast<float>(oq_info.offset);
644 const float out_scale = oq_info.scale;
646 const auto num_elements =
static_cast<float>(in_info.dimension(0));
// Affine requantization factors: out = A * sum + B (MEAN_SUM path).
648 const float A = in_scale / (out_scale * num_elements);
649 const float B = out_offset - (in_scale * in_offset) / (out_scale);
653 [&](
const Coordinates &)
655 const auto input_ptr =
reinterpret_cast<T *
>(
input.ptr());
// Four widened 32-bit accumulators (16 input lanes per step).
657 auto vec_res_value1 =
658 wrapper::vdup_n(
static_cast<PromotedType
>(0.f), wrapper::traits::vector_128_tag{});
659 auto vec_res_value2 =
660 wrapper::vdup_n(
static_cast<PromotedType
>(0.f), wrapper::traits::vector_128_tag{});
661 auto vec_res_value3 =
662 wrapper::vdup_n(
static_cast<PromotedType
>(0.f), wrapper::traits::vector_128_tag{});
663 auto vec_res_value4 =
664 wrapper::vdup_n(
static_cast<PromotedType
>(0.f), wrapper::traits::vector_128_tag{});
// Float accumulators for PROD (seeded with the multiplicative identity).
666 auto vec_res_value1_f = vdupq_n_f32(
static_cast<float>(1.f));
667 auto vec_res_value2_f = vdupq_n_f32(
static_cast<float>(1.f));
668 auto vec_res_value3_f = vdupq_n_f32(
static_cast<float>(1.f));
669 auto vec_res_value4_f = vdupq_n_f32(
static_cast<float>(1.f));
676 vec_res_value =
wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
679 uint32x4x4_t vec_res_idx{{0}};
681 int x = window_start_x;
682 for (; x <= (window_end_x - window_step_x); x += window_step_x)
// PROD: dequantize ((v - offset) * scale) and multiply in float space.
706 const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
707 const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
717 auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
718 auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
719 auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
720 auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
723 temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
724 temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
725 temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
726 temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
728 vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
729 vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
730 vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
731 vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
// ARG_IDX_MIN / ARG_IDX_MAX: track per-lane winners and their indices.
736 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
737 vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(
738 x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
739 vec_res_value = temp_vec_res_value;
744 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
745 vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(
746 x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
747 vec_res_value = temp_vec_res_value;
// Scalar tails: resolve vector index then let remaining elements override.
770 calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
774 for (; x < window_end_x; ++x)
776 if (*(input_ptr + x) < res)
779 res = *(input_ptr + x);
782 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
788 calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
792 for (; x < window_end_x; ++x)
794 if (*(input_ptr + x) > res)
797 res = *(input_ptr + x);
800 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
808 for (; x < window_end_x; ++x)
810 res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
812 *(
reinterpret_cast<T *
>(output.ptr())) = res;
820 for (; x < window_end_x; ++x)
822 res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
824 *(
reinterpret_cast<T *
>(output.ptr())) = res;
// PROD: multiply the four float accumulators together, finish scalar tail,
// then requantize the product back to T.
829 auto carry_res =
wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
839 for (; x < window_end_x; ++x)
842 if (std::is_same<T, uint8_t>::value)
853 if (std::is_same<T, uint8_t>::value)
862 *
reinterpret_cast<T *
>(output.ptr()) =
static_cast<T
>(res);
// SUM / MEAN_SUM: add the widened accumulators, pairwise-fold, scalar tail.
868 auto carry_res =
wrapper::vadd(vec_res_value1, vec_res_value2);
872 auto carry_paddition =
874 carry_paddition =
wrapper::vpadd(carry_paddition, carry_paddition);
878 for (; x < window_end_x; ++x)
880 res += *(input_ptr + x);
// MEAN_SUM: requantize with the precomputed affine transform.
885 const int32_t resFinal =
A * (
static_cast<float>(res)) +
B;
887 *
reinterpret_cast<T *
>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
// SUM: remove the (n-1) accumulated zero-point offsets before saturating.
892 res -= (in_info.dimension(0) - 1) * iq_info.offset;
893 *
reinterpret_cast<T *
>(output.ptr()) = utils::cast::saturate_cast<T>(res);
// RedOpYZW: reduction along a non-X axis (Y/Z/W) for non-quantized types.
// X stays vectorized (contiguous lanes); the reduction walks dimension
// `axis` via the tensor's byte strides. Vector body + scalar tail.
// NOTE(review): fragment — the op switch and several loads/stores are missing
// interior lines in this extraction.
906 template <
typename T,
int S>
910 using ExactTagType =
typename wrapper::traits::neon_vector<T, S>::tag_type;
913 inline void operator()(
const Window &in_window,
920 const TensorInfo in_info = *(in->info());
921 const int window_step_x = 16 /
sizeof(T);
922 const auto window_start_x_tmp =
static_cast<int>(in_window.x().start());
923 const auto window_end_x_tmp =
static_cast<int>(in_window.x().end());
// Iterate the full X extent manually inside the lambda.
925 const auto window_start_x =
static_cast<int>(0);
926 const auto window_end_x =
static_cast<int>(in_window.shape().x());
928 Window in_win_no_pad = in_window;
929 in_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
930 Window out_win_no_pad = out_window;
932 Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
934 Iterator
input(in, in_win_no_pad);
935 Iterator output(out, out_win_no_pad);
939 [&](
const Coordinates &)
941 const auto input_ptr =
reinterpret_cast<T *
>(
input.ptr());
944 int x = window_start_x;
945 for (; x <= (window_end_x - window_step_x); x += window_step_x)
947 neon_vector vec_res_value = {0};
969 uint32x4x4_t vec_res_idx{{0}};
// Walk the reduced axis; elements are addressed via byte strides.
971 for (
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
974 reinterpret_cast<T *
>(
input.ptr() + x *
sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
// ARG_IDX_MIN / ARG_IDX_MAX vector updates.
990 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
992 calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
993 vec_res_value = temp_vec_res_value;
998 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
1000 calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
1001 vec_res_value = temp_vec_res_value;
// MEAN_SUM: scale the accumulated sum by 1/width.
1021 auto vec_width_inv =
1023 vec_res_value =
wrapper::vmul(vec_res_value, vec_width_inv);
// ARG_IDX results are uint32 per element; fp16 needs a second index store
// because one value vector covers 8 lanes.
1028 wrapper::vstore(
reinterpret_cast<uint32_t *
>(output.ptr()) + x, vec_res_idx.val[0]);
1029 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1030 if (std::is_same<T, float16_t>::value)
1032 wrapper::vstore(
reinterpret_cast<uint32_t *
>(output.ptr()) + x + 4, vec_res_idx.val[1]);
1034 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1038 wrapper::vstore(
reinterpret_cast<T *
>(output.ptr() + x *
sizeof(T)), vec_res_value);
// Scalar tail: same walk per remaining x.
1043 for (; x < window_end_x; ++x)
1045 auto res_value = 0.f;
1053 res_value = *(input_ptr + x);
1058 res_value =
static_cast<T
>(1.f);
1063 res_value =
static_cast<T
>(0.f);
1068 uint32_t res_idx = 0;
1069 for (
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1072 reinterpret_cast<T *
>(
input.ptr() + x *
sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
1078 res_value += *in_ptr;
1081 res_value += *in_ptr * *in_ptr;
1084 res_value *= *in_ptr;
1088 if (*in_ptr < res_value)
1090 res_value = *in_ptr;
1097 if (*in_ptr > res_value)
1099 res_value = *in_ptr;
1106 res_value = *in_ptr < res_value ? *in_ptr : res_value;
1111 res_value = *in_ptr > res_value ? *in_ptr : res_value;
1121 res_value /= in_info.dimension(axis);
1126 *(
reinterpret_cast<uint32_t *
>(output.ptr()) + x) = res_idx;
1130 *(
reinterpret_cast<T *
>(output.ptr() + x *
sizeof(T))) = res_value;
// RedOpYZW_complex: SUM reduction for 2-channel (complex) tensors along a
// compile-time `axis`. Real/imag pairs are interleaved, hence the 2*x element
// addressing and the paired accumulators; runtime axis/op params are ignored.
// NOTE(review): fragment — some interior lines are missing.
1138 template <
typename T,
int S,
int axis, ReductionOperation op>
1139 struct RedOpYZW_complex
1142 using ExactTagType =
typename wrapper::traits::neon_vector<T, S>::tag_type;
1145 inline void operator()(
1146 const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
int,
const ReductionOperation)
1151 const TensorInfo in_info = *(in->info());
1152 const size_t stride_z = in_info.strides_in_bytes()[axis];
1153 const int window_step_x = 16 /
sizeof(T);
1154 const auto window_start_x_tmp =
static_cast<int>(in_window.x().start());
1155 const auto window_end_x_tmp =
static_cast<int>(in_window.x().end());
1157 const auto window_start_x =
static_cast<int>(0);
1158 const auto window_end_x =
static_cast<int>(in_window.shape().x());
1160 Window in_win_no_pad = in_window;
1161 in_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
1162 Window out_win_no_pad = out_window;
1164 Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
1166 Iterator
input(in, in_win_no_pad);
1167 Iterator output(out, out_win_no_pad);
1171 [&](
const Coordinates &)
1174 int x = window_start_x;
1175 for (; x <= (window_end_x - window_step_x); x += window_step_x)
// Two accumulators: one per 16-byte half of the interleaved pair block.
1177 neon_vector vec_res_value_0 = {0};
1178 neon_vector vec_res_value_1 = {0};
1180 vec_res_value_0 =
wrapper::vdup_n(
static_cast<T
>(0.f), ExactTagType{});
1181 vec_res_value_1 =
wrapper::vdup_n(
static_cast<T
>(0.f), ExactTagType{});
1183 T *out_ptr =
reinterpret_cast<T *
>(output.ptr() + 2 * x *
sizeof(T));
1184 for (
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1186 T *in_ptr_0 =
reinterpret_cast<T *
>(
input.ptr() + 2 * x *
sizeof(T) + stride_z * dim);
// Second 16-byte chunk of the same interleaved block.
1187 T *in_ptr_1 =
reinterpret_cast<T *
>(
input.ptr() + 2 * x *
sizeof(T) + 16 + stride_z * dim);
1192 vec_res_value_0 =
wrapper::vadd(vec_elements_0, vec_res_value_0);
1193 vec_res_value_1 =
wrapper::vadd(vec_elements_1, vec_res_value_1);
// Scalar tail: accumulate real (offset 0) and imag (offset 1) separately.
1201 for (; x < window_end_x; ++x)
1203 auto res_value_0 = 0.f;
1204 auto res_value_1 = 0.f;
1206 T *out_ptr =
reinterpret_cast<T *
>(output.ptr() + 2 * x *
sizeof(T));
1207 for (
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1209 T *in_ptr =
reinterpret_cast<T *
>(
input.ptr() + 2 * x *
sizeof(T) + stride_z * dim);
1210 res_value_0 += *in_ptr;
1211 res_value_1 += *(in_ptr + 1);
1213 *out_ptr = res_value_0;
1214 *(out_ptr + 1) = res_value_1;
// RedOpYZW_quantized: non-X-axis reduction for quantized (u8/s8) tensors.
// 16 input lanes per step are widened into four 32-bit accumulators; PROD
// dequantizes to float; MEAN_SUM requantizes with precomputed A/B; SUM
// compensates accumulated zero-point offsets. Vector body + scalar tail.
// NOTE(review): fragment — the op switch and many loads/stores are missing
// interior lines in this extraction.
1221 template <
typename T>
1222 struct RedOpYZW_quantized
1224 inline void operator()(
const Window &in_window,
1231 const TensorInfo in_info = *(in->info());
1232 const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
1235 const auto oq_info = out->info()->quantization_info().uniform();
1237 const int window_step_x = 16 /
sizeof(T);
1238 const auto window_start_x_tmp =
static_cast<int>(in_window.x().start());
1239 const auto window_end_x_tmp =
static_cast<int>(in_window.x().end());
1241 const auto window_start_x =
static_cast<int>(0);
1242 const auto window_end_x =
static_cast<int>(in_window.shape().x());
1244 Window in_win_no_pad = in_window;
1245 in_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
1246 Window out_win_no_pad = out_window;
1248 Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
1250 Iterator
input(in, in_win_no_pad);
1251 Iterator output(out, out_win_no_pad);
// Widened integer accumulators and float accumulators (for PROD).
1257 vector_type vec_res_value1{};
1258 vector_type vec_res_value2{};
1259 vector_type vec_res_value3{};
1260 vector_type vec_res_value4{};
1262 vector_type_f vec_res_value1_f{};
1263 vector_type_f vec_res_value2_f{};
1264 vector_type_f vec_res_value3_f{};
1265 vector_type_f vec_res_value4_f{};
1267 const float in_offset =
static_cast<float>(iq_info.offset);
1268 const float in_scale = iq_info.scale;
1270 const float out_offset =
static_cast<float>(oq_info.offset);
1271 const float out_scale = oq_info.scale;
1273 const float num_elements =
static_cast<float>(in_info.dimension(axis));
// Affine requantization factors for MEAN_SUM: out = A * sum + B.
1275 const float A = in_scale / (out_scale * num_elements);
1276 const float B = out_offset - (in_scale * in_offset) / (out_scale);
1278 const auto vec_A =
wrapper::vdup_n(
static_cast<float>(A), wrapper::traits::vector_128_tag{});
1279 const auto vec_B =
wrapper::vdup_n(
static_cast<float>(B), wrapper::traits::vector_128_tag{});
1283 [&](
const Coordinates &)
1285 const auto input_ptr =
reinterpret_cast<T *
>(
input.ptr());
1288 int x = window_start_x;
1289 for (; x <= (window_end_x - window_step_x); x += window_step_x)
1291 uint32x4x4_t vec_res_idx{{0}};
// Reset accumulators per x-step: 0 for sums, 1.0 for products.
1292 vec_res_value1 =
wrapper::vdup_n(
static_cast<PromotedType
>(0), wrapper::traits::vector_128_tag{});
1293 vec_res_value2 =
wrapper::vdup_n(
static_cast<PromotedType
>(0), wrapper::traits::vector_128_tag{});
1294 vec_res_value3 =
wrapper::vdup_n(
static_cast<PromotedType
>(0), wrapper::traits::vector_128_tag{});
1295 vec_res_value4 =
wrapper::vdup_n(
static_cast<PromotedType
>(0), wrapper::traits::vector_128_tag{});
1297 vec_res_value1_f =
wrapper::vdup_n(
static_cast<float>(1), wrapper::traits::vector_128_tag{});
1298 vec_res_value2_f =
wrapper::vdup_n(
static_cast<float>(1), wrapper::traits::vector_128_tag{});
1299 vec_res_value3_f =
wrapper::vdup_n(
static_cast<float>(1), wrapper::traits::vector_128_tag{});
1300 vec_res_value4_f =
wrapper::vdup_n(
static_cast<float>(1), wrapper::traits::vector_128_tag{});
// Walk the reduced axis via byte strides.
// NOTE(review): `input_ptr + x + strides_in_bytes()[axis] * dim` adds a
// byte stride to a T* — correct only for 1-byte T (u8/s8 here), but worth
// confirming against the full source.
1304 for (
unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
1306 const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
// SUM-family: accumulate widened values.
1321 vec_res_value1 =
wrapper::vadd(temp32x4t_1, vec_res_value1);
1322 vec_res_value2 =
wrapper::vadd(temp32x4t_2, vec_res_value2);
1323 vec_res_value3 =
wrapper::vadd(temp32x4t_3, vec_res_value3);
1324 vec_res_value4 =
wrapper::vadd(temp32x4t_4, vec_res_value4);
// PROD: dequantize then multiply in float space.
1329 const auto offset32x4f_4 =
wrapper::vdup_n(
static_cast<float>(iq_info.offset),
1330 wrapper::traits::vector_128_tag{});
1331 const auto scale32x4f_4 =
1342 auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
1343 auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
1344 auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
1345 auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
1353 vec_res_value1_f =
wrapper::vmul(temp32x4f_1, vec_res_value1_f);
1354 vec_res_value2_f =
wrapper::vmul(temp32x4f_2, vec_res_value2_f);
1355 vec_res_value3_f =
wrapper::vmul(temp32x4f_3, vec_res_value3_f);
1356 vec_res_value4_f =
wrapper::vmul(temp32x4f_4, vec_res_value4_f);
// ARG_IDX_MIN / ARG_IDX_MAX: per-lane winner + index tracking.
1361 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
1362 vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
1363 vec_res_idx, op, axis);
1364 vec_res_value = temp_vec_res_value;
1369 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
1370 vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
1371 vec_res_idx, op, axis);
1372 vec_res_value = temp_vec_res_value;
// ARG_IDX output: 16 uint32 indices per 16-element step.
1395 wrapper::vstore(
reinterpret_cast<uint32_t *
>(output.ptr() + 4 * x), vec_res_idx.val[0]);
1396 wrapper::vstore(
reinterpret_cast<uint32_t *
>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
1397 wrapper::vstore(
reinterpret_cast<uint32_t *
>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
1398 wrapper::vstore(
reinterpret_cast<uint32_t *
>(output.ptr() + 4 * x) + 12,
1399 vec_res_idx.val[3]);
// MIN/MAX output: store the winning values directly.
1405 wrapper::vstore(
reinterpret_cast<T *
>(output.ptr() + x), vec_res_value);
// SUM: subtract the (n-1) accumulated zero-point offsets, narrow, store.
1411 auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
1418 vec_res_s_value1 =
wrapper::vsub(vec_res_s_value1, offsets);
1419 vec_res_s_value2 =
wrapper::vsub(vec_res_s_value2, offsets);
1420 vec_res_s_value3 =
wrapper::vsub(vec_res_s_value3, offsets);
1421 vec_res_s_value4 =
wrapper::vsub(vec_res_s_value4, offsets);
1423 const auto temp16x8t_1 =
1425 const auto temp16x8t_2 =
1428 combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
// MEAN_SUM: fused multiply-add requantization (A * sum + B), then convert
// back to integer — vcvta (round-to-nearest) on aarch64, vcvt otherwise.
1433 vec_res_value1_f =
wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
1434 vec_res_value2_f =
wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
1435 vec_res_value3_f =
wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
1436 vec_res_value4_f =
wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
1439 vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
1440 vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
1441 vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
1442 vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
1443 #else // defined(__aarch64__)
1444 vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
1445 vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
1446 vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
1447 vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
1448 #endif // __aarch64__
1450 const auto temp16x8t_1 =
1452 const auto temp16x8t_2 =
// PROD: requantize the float product (divide by scale, add offset).
1461 const auto offset32x4f_4 =
1462 wrapper::vdup_n(
static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
1463 const auto iscale32x4f_4 =
vinvq_f32(vdupq_n_f32(iq_info.scale));
1475 vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
1476 vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
1477 vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
1478 vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
1480 const auto temp16x8t_1 =
1482 const auto temp16x8t_2 =
// Scalar tail: per-element walk of the reduced axis.
1495 for (; x < window_end_x; ++x)
1497 float res_value = 0.f;
// Separate int accumulator used for the non-aarch64 MEAN_SUM path below.
1498 int32_t res_value_q = 0;
1507 res_value = *(input_ptr + x);
1512 res_value =
static_cast<T
>(1.0f);
1517 res_value =
static_cast<T
>(0.0f);
1521 uint32_t res_idx = 0;
1523 for (
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1526 reinterpret_cast<T *
>(
input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
1531 res_value += *in_ptr;
1536 res_value_q += *in_ptr;
1541 res_value += *in_ptr * *in_ptr;
1547 if (std::is_same<T, uint8_t>::value)
1559 if (*in_ptr < res_value)
1561 res_value = *in_ptr;
1568 if (*in_ptr > res_value)
1570 res_value = *in_ptr;
1577 res_value = *in_ptr < res_value ? *in_ptr : res_value;
1582 res_value = *in_ptr > res_value ? *in_ptr : res_value;
// MEAN_SUM tail: requantize (non-aarch64 branch shown), saturate, store.
1598 #else // defined(__aarch64__)
1599 const int32_t res =
A * (
static_cast<float>(res_value_q)) +
B;
1600 #endif // __aarch64__
1601 *
reinterpret_cast<T *
>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
// SUM tail: remove accumulated zero-point offsets before saturating.
1607 res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
1608 *
reinterpret_cast<T *
>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
1615 if (std::is_same<T, uint8_t>::value)
1623 *(
reinterpret_cast<T *
>(output.ptr() + x)) = res;
// ARG_IDX tail: uint32 index output (4 bytes per element).
1629 *(
reinterpret_cast<uint32_t *
>(output.ptr() + x * 4)) = res_idx;
1633 *(
reinterpret_cast<T *
>(output.ptr() + x)) = res_value;
// reduce_op dispatch: selects the Reducer<functor>::reduceN specialization
// from the reduction axis and the input's data type. Complex (2-channel)
// input is special-cased to a Z-axis SUM.
// NOTE(review): fragment — the enclosing function signature, case labels and
// error branches are missing from this extraction.
1644 const bool is_complex = (
input->info()->num_channels() == 2);
1651 switch (
input->info()->data_type())
1657 return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(
1658 window,
input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(),
// Axis 0 (X) dispatch by data type.
1676 switch (
input->info()->data_type())
1680 return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window,
input, output,
1681 RedOpX_quantized<uint8_t>(), op);
1685 return Reducer<RedOpX_quantized<int8_t>>::reduceX(window,
input, output, RedOpX_quantized<int8_t>(),
1688 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1690 return Reducer<RedOpX<float16_t, 8>>::reduceX(window,
input, output, RedOpX<float16_t, 8>(), op);
1691 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1694 return Reducer<RedOpX<float, 4>>::reduceX(window,
input, output, RedOpX<float, 4>(), op);
1698 return Reducer<RedOpX<int32_t, 4>>::reduceX(window,
input, output, RedOpX<int32_t, 4>(), op);
// Axis 1 (Y) dispatch.
1707 switch (
input->info()->data_type())
1711 return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window,
input, output,
1712 RedOpYZW_quantized<uint8_t>(), op);
1716 return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window,
input, output,
1717 RedOpYZW_quantized<int8_t>(), op);
1719 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1721 return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window,
input, output, RedOpYZW<float16_t, 8>(),
1723 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1725 return Reducer<RedOpYZW<float, 4>>::reduceY(window,
input, output, RedOpYZW<float, 4>(), op);
1727 return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window,
input, output, RedOpYZW<int32_t, 4>(), op);
// Axis 2 (Z) dispatch.
1732 switch (
input->info()->data_type())
1735 return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window,
input, output,
1736 RedOpYZW_quantized<uint8_t>(), op);
1738 return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window,
input, output,
1739 RedOpYZW_quantized<int8_t>(), op);
1740 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1742 return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window,
input, output, RedOpYZW<float16_t, 8>(),
1744 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1746 return Reducer<RedOpYZW<float, 4>>::reduceZ(window,
input, output, RedOpYZW<float, 4>(), op);
1748 return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window,
input, output, RedOpYZW<int32_t, 4>(), op);
// Axis 3 (W) dispatch.
1753 switch (
input->info()->data_type())
1756 return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window,
input, output,
1757 RedOpYZW_quantized<uint8_t>(), op);
1759 return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window,
input, output,
1760 RedOpYZW_quantized<int8_t>(), op);
1761 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1763 return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window,
input, output, RedOpYZW<float16_t, 8>(),
1765 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1767 return Reducer<RedOpYZW<float, 4>>::reduceW(window,
input, output, RedOpYZW<float, 4>(), op);
1769 return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window,
input, output, RedOpYZW<int32_t, 4>(), op);
// Kernel validate/configure/run fragments: argument validation (channel
// count, axis bound, output shape for non-argminmax), output auto-init and
// window configuration, and the run-time dispatch into reduce_op.
// NOTE(review): fragment — these lines belong to several distinct member
// functions whose signatures/bodies are missing from this extraction.
1785 if (
input->num_channels() == 1)
1798 "Reduction axis greater than max number of dimensions");
// If the output is already initialized, its shape must match (non-argminmax
// paths also check data type elsewhere — lines not visible here).
1801 if (output->total_size() != 0)
1804 if (!is_arg_min_max)
1816 const TensorInfo tensor_info_reshaped =
input->clone()->set_tensor_shape(
output_shape);
// configure(): record axis, set up output info, register the kernel window.
1841 _reduction_axis = axis;
1845 INEKernel::configure(win);
1856 .set_data_type(output_data_type)
1858 .set_is_resizable(
true));
// run(): forward to the reduce_op dispatcher above.
1877 reduce_op(
window, _input, _output, _reduction_axis, _op);