49 void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output,
int offset = 0)
51 if(std::is_same<T, uint8_t>::value)
64 uint32x4x4_t calculate_index(uint32_t idx, T a, T
b, uint32x4x4_t c,
ReductionOperation op,
int axis)
76 uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
81 uint32x4x4_t res = { {
wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } };
87 uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c,
ReductionOperation op,
int axis)
89 uint32x4x4_t mask{ { 0 } };
90 uint8x16_t mask_u8{ 0 };
106 uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
107 { idx + 4, idx + 5, idx + 6, idx + 7 },
108 { idx + 8, idx + 9, idx + 10, idx + 11 },
109 { idx + 12, idx + 13, idx + 14, idx + 15 }
114 vec_idx.val[0] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
115 vec_idx.val[1] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
116 vec_idx.val[2] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
117 vec_idx.val[3] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
122 vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
123 vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
124 vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
125 vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
133 template <
typename T>
134 inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
135 typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>
::type >
::type 143 template <
typename T>
144 inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
145 typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>
::type >
::type 155 template <
typename T>
156 inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
157 typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>
::type >
::type 165 template <
typename T>
166 inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
167 typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>
::type >
::type 176 template <
typename T>
177 uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value,
ReductionOperation op)
179 uint32x4_t res_idx_mask{ 0 };
180 uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
184 auto pmin = calculate_min(vec_res_value);
190 auto pmax = calculate_max(vec_res_value);
200 return (res - 0xFFFFFFFF);
203 template <
typename T>
204 uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value,
ReductionOperation op)
206 uint32x4x4_t res_idx_mask{ { 0 } };
207 uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
208 uint8x16_t mask_u8{ 0 };
211 auto pmin = calculate_min(vec_res_value);
216 auto pmax = calculate_max(vec_res_value);
227 res_idx_mask.val[0] =
wrapper::vand(vec_res_idx.val[0], wide_u32_1);
228 res_idx_mask.val[1] =
wrapper::vand(vec_res_idx.val[1], wide_u32_2);
229 res_idx_mask.val[2] =
wrapper::vand(vec_res_idx.val[2], wide_u32_3);
230 res_idx_mask.val[3] =
wrapper::vand(vec_res_idx.val[3], wide_u32_4);
231 res_idx_mask.val[0] =
wrapper::vadd(res_idx_mask.val[0], mask_ones);
232 res_idx_mask.val[1] =
wrapper::vadd(res_idx_mask.val[1], mask_ones);
233 res_idx_mask.val[2] =
wrapper::vadd(res_idx_mask.val[2], mask_ones);
234 res_idx_mask.val[3] =
wrapper::vadd(res_idx_mask.val[3], mask_ones);
236 uint32_t res = 0xFFFFFFFF;
247 return (res - 0xFFFFFFFF);
250 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 252 uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c,
ReductionOperation op,
int axis)
254 uint32x4x2_t mask{ 0 };
255 uint16x8_t mask_u16{ 0 };
266 uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
267 { idx + 4, idx + 5, idx + 6, idx + 7 }
272 vec_idx.val[0] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
273 vec_idx.val[1] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
275 uint32x4x4_t res = {
wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
284 inline float16x4_t calculate_min(float16x8_t in)
291 inline float16x4_t calculate_max(float16x8_t in)
299 uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value,
ReductionOperation op)
301 uint32x4x2_t res_idx_mask{ 0 };
302 uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
306 auto pmin = calculate_min(vec_res_value);
311 auto pmax = calculate_max(vec_res_value);
318 res_idx_mask.val[0] =
wrapper::vand(vec_res_idx.val[0], wide_u32_1);
319 res_idx_mask.val[1] =
wrapper::vand(vec_res_idx.val[1], wide_u32_2);
320 res_idx_mask.val[0] =
wrapper::vadd(res_idx_mask.val[0], mask_ones);
321 res_idx_mask.val[1] =
wrapper::vadd(res_idx_mask.val[1], mask_ones);
323 uint32_t res = 0xFFFFFFFF;
334 return (res - 0xFFFFFFFF);
336 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 342 static void reduceX(
const Window &window,
const ITensor *
input, ITensor *output, F f,
const ReductionOperation op)
345 Window out_window(window);
346 out_window.set(
Window::DimX, Window::Dimension(0, 1, 1));
348 f(window, out_window, input, output, op);
350 static void reduceY(
const Window &window,
const ITensor *input, ITensor *output, F f,
const ReductionOperation op)
353 Window in_window(window);
354 Window out_window(window);
356 in_window.set(
Window::DimY, Window::Dimension(0, 1, 1));
357 out_window.set(
Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1)));
359 f(in_window, out_window, input, output, 1, op);
361 static void reduceZ(
const Window &window,
const ITensor *input, ITensor *output, F f,
const ReductionOperation op)
364 Window in_window(window);
365 Window out_window(window);
367 in_window.set(
Window::DimZ, Window::Dimension(0, 1, 1));
368 out_window.set(
Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2)));
370 f(in_window, out_window, input, output, 2, op);
372 static void reduceW(
const Window &window,
const ITensor *input, ITensor *output, F f,
const ReductionOperation op)
375 Window in_window(window);
376 Window out_window(window);
378 in_window.set(3, Window::Dimension(0, 1, 1));
379 out_window.set(3, Window::Dimension(0, 1, 1));
381 f(in_window, out_window, input, output, 3, op);
385 template <
typename T,
int S>
389 using ExactTagType =
typename wrapper::traits::neon_vector<T, S>::tag_type;
391 inline void operator()(
const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
const ReductionOperation op)
393 const size_t input_dim_0 = in->info()->dimension(0);
394 const int window_step_x = 16 /
sizeof(T);
395 const auto window_start_x =
static_cast<int>(in_window.x().start());
396 const auto window_end_x =
static_cast<int>(in_window.x().end());
398 Window in_win_no_pad = in_window;
399 in_win_no_pad.set(
Window::DimX, Window::Dimension(0, 1, 1));
401 Iterator
input(in, in_win_no_pad);
402 Iterator output(out, out_window);
406 const auto input_ptr =
reinterpret_cast<const T *
>(input.ptr());
408 auto init_res_value =
static_cast<T
>(0.f);
416 init_res_value =
static_cast<T
>(*input_ptr);
421 init_res_value =
static_cast<T
>(1.f);
428 uint32x4x4_t vec_res_idx{ { 0 } };
431 int x = window_start_x;
432 for(; x <= (window_end_x - window_step_x); x += window_step_x)
449 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
450 vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
451 vec_res_value = temp_vec_res_value;
456 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
457 vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
458 vec_res_value = temp_vec_res_value;
482 #ifdef ARM_COMPUTE_DEBUG_ENABLED 483 auto res =
static_cast<T
>(0.f);
484 for(
int i = 0; i < S; ++i)
488 #else // ARM_COMPUTE_DEBUG_ENABLED 490 for(
int i = 0; i < S / 4; ++i)
495 #endif // ARM_COMPUTE_DEBUG_ENABLED 499 for(; x < window_end_x; ++x)
501 res += (*(input_ptr + x)) * (*(input_ptr + x));
507 for(; x < window_end_x; ++x)
509 res += *(input_ptr + x);
518 *(
reinterpret_cast<T *
>(output.ptr())) = res;
525 for(
int i = 0; i < S / 2; ++i)
531 for(; x < window_end_x; ++x)
533 res *= *(input_ptr + x);
536 *(
reinterpret_cast<T *
>(output.ptr())) = res;
541 auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
545 for(; x < window_end_x; ++x)
547 if(*(input_ptr + x) < res)
550 res = *(input_ptr + x);
553 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
558 auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
562 for(; x < window_end_x; ++x)
564 if(*(input_ptr + x) > res)
567 res = *(input_ptr + x);
570 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
578 for(; x < window_end_x; ++x)
580 res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
582 *(
reinterpret_cast<T *
>(output.ptr())) = res;
590 for(; x < window_end_x; ++x)
592 res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
594 *(
reinterpret_cast<T *
>(output.ptr())) = res;
605 template <
typename T>
606 struct RedOpX_quantized
608 inline void operator()(
const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
const ReductionOperation op)
612 const TensorInfo in_info = *(in->info());
613 const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
615 const int window_step_x = 16 /
sizeof(T);
616 const auto window_start_x =
static_cast<int>(in_window.x().start());
617 const auto window_end_x =
static_cast<int>(in_window.x().end());
619 Window in_win_no_pad = in_window;
620 in_win_no_pad.set(
Window::DimX, Window::Dimension(0, 1, 1));
622 Iterator
input(in, in_win_no_pad);
623 Iterator output(out, out_window);
627 const auto input_ptr =
reinterpret_cast<T *
>(input.ptr());
629 auto vec_res_value1 =
wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
630 auto vec_res_value2 =
wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
631 auto vec_res_value3 =
wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
632 auto vec_res_value4 =
wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
634 auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
635 auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
636 auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
637 auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
643 vec_res_value =
wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
646 uint32x4x4_t vec_res_idx{ { 0 } };
648 int x = window_start_x;
649 for(; x <= (window_end_x - window_step_x); x += window_step_x)
673 const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
674 const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
684 auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
685 auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
686 auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
687 auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
690 temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
691 temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
692 temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
693 temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
695 vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
696 vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
697 vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
698 vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
703 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
704 vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
705 vec_res_value = temp_vec_res_value;
710 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
711 vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
712 vec_res_value = temp_vec_res_value;
734 auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
738 for(; x < window_end_x; ++x)
740 if(*(input_ptr + x) < res)
743 res = *(input_ptr + x);
746 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
751 auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
755 for(; x < window_end_x; ++x)
757 if(*(input_ptr + x) > res)
760 res = *(input_ptr + x);
763 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
771 for(; x < window_end_x; ++x)
773 res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
775 *(
reinterpret_cast<T *
>(output.ptr())) = res;
783 for(; x < window_end_x; ++x)
785 res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
787 *(
reinterpret_cast<T *
>(output.ptr())) = res;
792 auto carry_res =
wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
802 for(; x < window_end_x; ++x)
805 if(std::is_same<T, uint8_t>::value)
816 if(std::is_same<T, uint8_t>::value)
825 *
reinterpret_cast<T *
>(output.ptr()) = static_cast<T>(res);
831 auto carry_res =
wrapper::vadd(vec_res_value1, vec_res_value2);
836 carry_paddition =
wrapper::vpadd(carry_paddition, carry_paddition);
840 for(; x < window_end_x; ++x)
842 res += *(input_ptr + x);
847 res /=
static_cast<int32_t
>(in_info.dimension(0));
852 res -= (in_info.dimension(0) - 1) * iq_info.offset;
854 *
reinterpret_cast<T *
>(output.ptr()) = utils::cast::saturate_cast<T>(res);
865 template <
typename T,
int S>
869 using ExactTagType =
typename wrapper::traits::neon_vector<T, S>::tag_type;
872 inline void operator()(
const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
int axis,
const ReductionOperation op)
874 const TensorInfo in_info = *(in->info());
875 const int window_step_x = 16 /
sizeof(T);
876 const auto window_start_x_tmp =
static_cast<int>(in_window.x().start());
877 const auto window_end_x_tmp =
static_cast<int>(in_window.x().end());
879 const auto window_start_x =
static_cast<int>(0);
880 const auto window_end_x =
static_cast<int>(in_window.shape().x());
882 Window in_win_no_pad = in_window;
883 in_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
884 Window out_win_no_pad = out_window;
885 out_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
887 Iterator
input(in, in_win_no_pad);
888 Iterator output(out, out_win_no_pad);
892 const auto input_ptr =
reinterpret_cast<T *
>(input.ptr());
895 int x = window_start_x;
896 for(; x <= (window_end_x - window_step_x); x += window_step_x)
898 neon_vector vec_res_value = { 0 };
920 uint32x4x4_t vec_res_idx{ { 0 } };
922 for(
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
924 const T *in_ptr =
reinterpret_cast<T *
>(input.ptr() + x *
sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
940 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
941 vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
942 vec_res_value = temp_vec_res_value;
947 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
948 vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
949 vec_res_value = temp_vec_res_value;
975 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
976 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 977 if(std::is_same<T, float16_t>::value)
979 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
981 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 985 wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x *
sizeof(T)), vec_res_value);
990 for(; x < window_end_x; ++x)
992 auto res_value = 0.f;
1000 res_value = *(input_ptr + x);
1005 res_value =
static_cast<T
>(1.f);
1010 res_value =
static_cast<T
>(0.f);
1015 uint32_t res_idx = 0;
1016 for(
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1018 const T *in_ptr =
reinterpret_cast<T *
>(input.ptr() + x *
sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
1024 res_value += *in_ptr;
1027 res_value += *in_ptr * *in_ptr;
1030 res_value *= *in_ptr;
1034 if(*in_ptr < res_value)
1036 res_value = *in_ptr;
1043 if(*in_ptr > res_value)
1045 res_value = *in_ptr;
1052 res_value = *in_ptr < res_value ? *in_ptr : res_value;
1057 res_value = *in_ptr > res_value ? *in_ptr : res_value;
1067 res_value /= in_info.dimension(axis);
1072 *(
reinterpret_cast<uint32_t *
>(output.ptr()) + x) = res_idx;
1076 *(
reinterpret_cast<T *
>(output.ptr() + x *
sizeof(T))) = res_value;
1084 template <
typename T,
int S,
int axis, ReductionOperation op>
1085 struct RedOpYZW_complex
1088 using ExactTagType =
typename wrapper::traits::neon_vector<T, S>::tag_type;
1091 inline void operator()(
const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
int,
const ReductionOperation)
1096 const TensorInfo in_info = *(in->info());
1097 const size_t stride_z = in_info.strides_in_bytes()[axis];
1098 const int window_step_x = 16 /
sizeof(T);
1099 const auto window_start_x_tmp =
static_cast<int>(in_window.x().start());
1100 const auto window_end_x_tmp =
static_cast<int>(in_window.x().end());
1102 const auto window_start_x =
static_cast<int>(0);
1103 const auto window_end_x =
static_cast<int>(in_window.shape().x());
1105 Window in_win_no_pad = in_window;
1106 in_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
1107 Window out_win_no_pad = out_window;
1108 out_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
1110 Iterator
input(in, in_win_no_pad);
1111 Iterator output(out, out_win_no_pad);
1116 int x = window_start_x;
1117 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1119 neon_vector vec_res_value_0 = { 0 };
1120 neon_vector vec_res_value_1 = { 0 };
1122 vec_res_value_0 =
wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
1123 vec_res_value_1 =
wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
1125 T *out_ptr =
reinterpret_cast<T *
>(output.ptr() + 2 * x *
sizeof(T));
1126 for(
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1128 T *in_ptr_0 =
reinterpret_cast<T *
>(input.ptr() + 2 * x *
sizeof(T) + stride_z * dim);
1129 T *in_ptr_1 =
reinterpret_cast<T *
>(input.ptr() + 2 * x *
sizeof(T) + 16 + stride_z * dim);
1134 vec_res_value_0 =
wrapper::vadd(vec_elements_0, vec_res_value_0);
1135 vec_res_value_1 =
wrapper::vadd(vec_elements_1, vec_res_value_1);
1143 for(; x < window_end_x; ++x)
1145 auto res_value_0 = 0.f;
1146 auto res_value_1 = 0.f;
1148 T *out_ptr =
reinterpret_cast<T *
>(output.ptr() + 2 * x *
sizeof(T));
1149 for(
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1151 T *in_ptr =
reinterpret_cast<T *
>(input.ptr() + 2 * x *
sizeof(T) + stride_z * dim);
1152 res_value_0 += *in_ptr;
1153 res_value_1 += *(in_ptr + 1);
1155 *out_ptr = res_value_0;
1156 *(out_ptr + 1) = res_value_1;
1163 template <
typename T>
1164 struct RedOpYZW_quantized
1166 inline void operator()(
const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
int axis,
const ReductionOperation op)
1168 const TensorInfo in_info = *(in->info());
1169 const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
1172 const int window_step_x = 16 /
sizeof(T);
1173 const auto window_start_x_tmp =
static_cast<int>(in_window.x().start());
1174 const auto window_end_x_tmp =
static_cast<int>(in_window.x().end());
1176 const auto window_start_x =
static_cast<int>(0);
1177 const auto window_end_x =
static_cast<int>(in_window.shape().x());
1179 Window in_win_no_pad = in_window;
1180 in_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
1181 Window out_win_no_pad = out_window;
1182 out_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
1184 Iterator
input(in, in_win_no_pad);
1185 Iterator output(out, out_win_no_pad);
1190 vector_type vec_res_value1{};
1191 vector_type vec_res_value2{};
1192 vector_type vec_res_value3{};
1193 vector_type vec_res_value4{};
1195 vector_type_f vec_res_value1_f{};
1196 vector_type_f vec_res_value2_f{};
1197 vector_type_f vec_res_value3_f{};
1198 vector_type_f vec_res_value4_f{};
1202 const auto input_ptr =
reinterpret_cast<T *
>(input.ptr());
1205 int x = window_start_x;
1206 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1208 uint32x4x4_t vec_res_idx{ { 0 } };
1209 vec_res_value1 =
wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
1210 vec_res_value2 =
wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
1211 vec_res_value3 =
wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
1212 vec_res_value4 =
wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
1214 vec_res_value1_f =
wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
1215 vec_res_value2_f =
wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
1216 vec_res_value3_f =
wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
1217 vec_res_value4_f =
wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
1221 for(
unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
1223 const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
1238 vec_res_value1 =
wrapper::vadd(temp32x4t_1, vec_res_value1);
1239 vec_res_value2 =
wrapper::vadd(temp32x4t_2, vec_res_value2);
1240 vec_res_value3 =
wrapper::vadd(temp32x4t_3, vec_res_value3);
1241 vec_res_value4 =
wrapper::vadd(temp32x4t_4, vec_res_value4);
1246 const auto offset32x4f_4 =
wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
1247 const auto scale32x4f_4 =
wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
1257 auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
1258 auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
1259 auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
1260 auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
1268 vec_res_value1_f =
wrapper::vmul(temp32x4f_1, vec_res_value1_f);
1269 vec_res_value2_f =
wrapper::vmul(temp32x4f_2, vec_res_value2_f);
1270 vec_res_value3_f =
wrapper::vmul(temp32x4f_3, vec_res_value3_f);
1271 vec_res_value4_f =
wrapper::vmul(temp32x4f_4, vec_res_value4_f);
1276 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
1277 vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
1278 vec_res_value = temp_vec_res_value;
1283 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
1284 vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
1285 vec_res_value = temp_vec_res_value;
1308 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
1309 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
1310 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
1311 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]);
1317 wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
1323 auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
1330 vec_res_s_value1 =
wrapper::vsub(vec_res_s_value1, offsets);
1331 vec_res_s_value2 =
wrapper::vsub(vec_res_s_value2, offsets);
1332 vec_res_s_value3 =
wrapper::vsub(vec_res_s_value3, offsets);
1333 vec_res_s_value4 =
wrapper::vsub(vec_res_s_value4, offsets);
1338 combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
1343 const auto vec_width_inv =
wrapper::vinv(
wrapper::vdup_n(static_cast<float>(in_info.dimension(axis)), wrapper::traits::vector_128_tag{}));
1344 vec_res_value1_f =
wrapper::vmul(wrapper::vcvt<float>(vec_res_value1), vec_width_inv);
1345 vec_res_value2_f =
wrapper::vmul(wrapper::vcvt<float>(vec_res_value2), vec_width_inv);
1346 vec_res_value3_f =
wrapper::vmul(wrapper::vcvt<float>(vec_res_value3), vec_width_inv);
1347 vec_res_value4_f =
wrapper::vmul(wrapper::vcvt<float>(vec_res_value4), vec_width_inv);
1349 vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
1350 vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
1351 vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
1352 vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
1363 const auto offset32x4f_4 =
wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
1364 const auto iscale32x4f_4 =
vinvq_f32(vdupq_n_f32(iq_info.scale));
1372 vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
1373 vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
1374 vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
1375 vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
1390 for(; x < window_end_x; ++x)
1392 float res_value = 0.f;
1400 res_value = *(input_ptr + x);
1405 res_value =
static_cast<T
>(1.0f);
1410 res_value =
static_cast<T
>(0.0f);
1414 uint32_t res_idx = 0;
1416 for(
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1418 const T *in_ptr =
reinterpret_cast<T *
>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
1424 res_value += *in_ptr;
1429 res_value += *in_ptr * *in_ptr;
1435 if(std::is_same<T, uint8_t>::value)
1447 if(*in_ptr < res_value)
1449 res_value = *in_ptr;
1456 if(*in_ptr > res_value)
1458 res_value = *in_ptr;
1465 res_value = *in_ptr < res_value ? *in_ptr : res_value;
1470 res_value = *in_ptr > res_value ? *in_ptr : res_value;
1482 int32_t res =
static_cast<int32_t
>(res_value);
1483 res /=
static_cast<int32_t
>(in_info.dimension(axis));
1484 *
reinterpret_cast<T *
>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
1490 res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
1491 *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
1498 if(std::is_same<T, uint8_t>::value)
1506 *(
reinterpret_cast<T *
>(output.ptr() + x)) = res;
1512 *(
reinterpret_cast<uint32_t *
>(output.ptr() + x * 4)) = res_idx;
1516 *(
reinterpret_cast<T *
>(output.ptr() + x)) = res_value;
1524 void reduce_op(
const Window &window,
const ITensor *input, ITensor *output,
unsigned int axis,
const ReductionOperation op)
1526 const bool is_complex = (input->info()->num_channels() == 2);
1533 switch(input->info()->data_type())
1539 return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
1555 switch(input->info()->data_type())
1558 return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
1560 return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
1561 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1563 return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
1564 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1566 return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
1568 return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op);
1573 switch(input->info()->data_type())
1576 return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
1578 return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
1579 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1581 return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
1582 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1584 return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
1586 return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, RedOpYZW<int32_t, 4>(), op);
1591 switch(input->info()->data_type())
1594 return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
1596 return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op);
1597 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1599 return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
1600 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1602 return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
1604 return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, RedOpYZW<int32_t, 4>(), op);
1609 switch(input->info()->data_type())
1612 return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
1614 return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op);
1615 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1617 return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
1618 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1620 return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
1622 return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, RedOpYZW<int32_t, 4>(), op);
1631 Status validate_arguments(
const ITensorInfo *input,
const ITensorInfo *output,
unsigned int axis,
ReductionOperation op)
1638 if(input->num_channels() == 1)
1652 if(output->total_size() != 0)
1667 const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
1689 _reduction_axis = axis;
1693 INEKernel::configure(win);
1700 auto_init_if_empty(*output->
info(), input->
info()->
clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(
true));
1716 reduce_op(window, _input, _output, _reduction_axis, _op);
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
uint8x8_t vorr(const uint8x8_t &a, const uint8x8_t &b)
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
float dequantize_qasymm8(uint8_t value, const INFO_TYPE &qinfo)
Dequantize a value given an unsigned 8-bit asymmetric quantization scheme.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...)
ReductionOperation
Available reduction operations.
uint8_t quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given an unsigned 8-bit asymmetric quantization scheme.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
uint8x16_t vloadq(const uint8_t *ptr)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Store the tensor's metadata.
float32x2_t vinv(const float32x2_t &a)
#define ARM_COMPUTE_ERROR_THROW_ON(status)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
NEReductionOperationKernel()
Default constructor.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
decltype(strategy::transforms) typedef type
Interface for CPU tensor.
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
1 channel, 1 S32 per channel
uint32x2_t vqmovn(const uint64x2_t &a)
static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
Static function to check if given info will lead to a valid configuration of NEReductionOperationKernel.
uint8x8_t vpadd(const uint8x8_t &a, const uint8x8_t &b)
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
1 channel, 1 U32 per channel
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
int8_t quantize_qasymm8_signed(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given a signed 8-bit asymmetric quantization scheme.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
int16x4_t vreinterpret(const uint16x4_t &a)
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vand(const uint8x8_t &a, const uint8x8_t &b)
TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis, bool keep_dims=true)
Calculate the reduced shape of a tensor given an axis.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
uint8x8_t vgetlow(const uint8x16_t val)
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
void configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
Set the source, destination of the kernel.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
uint8x8_t vgethigh(const uint8x16_t val)
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
uint8x8_t vcgt(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vbsl(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Information about executing thread and CPU.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
float32x4_t vinvq_f32(float32x4_t x)
Calculate reciprocal.
void vstore(uint8_t *ptr, uint8x8_t val)
uint8x8_t vclt(const uint8x8_t &a, const uint8x8_t &b)
float dequantize_qasymm8_signed(int8_t value, const INFO_TYPE &qinfo)
Dequantize a value given a signed 8-bit asymmetric quantization scheme.
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function passed.
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
uint8x8_t vpmax(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vpmin(const uint8x8_t &a, const uint8x8_t &b)
static constexpr size_t num_max_dimensions
Number of dimensions the tensor has.
DataType
Available data types.
uint16x8_t vmovl(const uint8x8_t &a)
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
Describe a multidimensional execution window.
uint8x8_t vceq(const uint8x8_t &a, const uint8x8_t &b)
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
uint32x2_t vqmovun(const int64x2_t &a)