50 void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output,
int offset = 0)
52 if(std::is_same<T, uint8_t>::value)
65 uint32x4x4_t calculate_index(uint32_t idx, T a, T
b, uint32x4x4_t c,
ReductionOperation op,
int axis)
77 uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
82 uint32x4x4_t res = { {
wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } };
88 uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c,
ReductionOperation op,
int axis)
90 uint32x4x4_t mask{ { 0 } };
91 uint8x16_t mask_u8{ 0 };
107 uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
108 { idx + 4, idx + 5, idx + 6, idx + 7 },
109 { idx + 8, idx + 9, idx + 10, idx + 11 },
110 { idx + 12, idx + 13, idx + 14, idx + 15 }
115 vec_idx.val[0] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
116 vec_idx.val[1] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
117 vec_idx.val[2] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
118 vec_idx.val[3] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
123 vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
124 vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
125 vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
126 vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
134 template <
typename T>
135 inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
136 typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>
::type >
::type 144 template <
typename T>
145 inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
146 typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>
::type >
::type 156 template <
typename T>
157 inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
158 typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>
::type >
::type 166 template <
typename T>
167 inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
168 typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>
::type >
::type 177 template <
typename T>
178 uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value,
ReductionOperation op)
180 uint32x4_t res_idx_mask{ 0 };
181 uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
185 auto pmin = calculate_min(vec_res_value);
191 auto pmax = calculate_max(vec_res_value);
201 return (res - 0xFFFFFFFF);
204 template <
typename T>
205 uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value,
ReductionOperation op)
207 uint32x4x4_t res_idx_mask{ { 0 } };
208 uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
209 uint8x16_t mask_u8{ 0 };
212 auto pmin = calculate_min(vec_res_value);
217 auto pmax = calculate_max(vec_res_value);
228 res_idx_mask.val[0] =
wrapper::vand(vec_res_idx.val[0], wide_u32_1);
229 res_idx_mask.val[1] =
wrapper::vand(vec_res_idx.val[1], wide_u32_2);
230 res_idx_mask.val[2] =
wrapper::vand(vec_res_idx.val[2], wide_u32_3);
231 res_idx_mask.val[3] =
wrapper::vand(vec_res_idx.val[3], wide_u32_4);
232 res_idx_mask.val[0] =
wrapper::vadd(res_idx_mask.val[0], mask_ones);
233 res_idx_mask.val[1] =
wrapper::vadd(res_idx_mask.val[1], mask_ones);
234 res_idx_mask.val[2] =
wrapper::vadd(res_idx_mask.val[2], mask_ones);
235 res_idx_mask.val[3] =
wrapper::vadd(res_idx_mask.val[3], mask_ones);
237 uint32_t res = 0xFFFFFFFF;
248 return (res - 0xFFFFFFFF);
251 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 253 uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c,
ReductionOperation op,
int axis)
255 uint32x4x2_t mask{ 0 };
256 uint16x8_t mask_u16{ 0 };
267 uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
268 { idx + 4, idx + 5, idx + 6, idx + 7 }
273 vec_idx.val[0] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
274 vec_idx.val[1] =
wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
276 uint32x4x4_t res = {
wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
285 inline float16x4_t calculate_min(float16x8_t in)
292 inline float16x4_t calculate_max(float16x8_t in)
300 uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value,
ReductionOperation op)
302 uint32x4x2_t res_idx_mask{ 0 };
303 uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
307 auto pmin = calculate_min(vec_res_value);
312 auto pmax = calculate_max(vec_res_value);
319 res_idx_mask.val[0] =
wrapper::vand(vec_res_idx.val[0], wide_u32_1);
320 res_idx_mask.val[1] =
wrapper::vand(vec_res_idx.val[1], wide_u32_2);
321 res_idx_mask.val[0] =
wrapper::vadd(res_idx_mask.val[0], mask_ones);
322 res_idx_mask.val[1] =
wrapper::vadd(res_idx_mask.val[1], mask_ones);
324 uint32_t res = 0xFFFFFFFF;
335 return (res - 0xFFFFFFFF);
337 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 343 static void reduceX(
const Window &window,
const ITensor *
input, ITensor *output, F f,
const ReductionOperation op)
346 Window out_window(window);
347 out_window.set(
Window::DimX, Window::Dimension(0, 1, 1));
349 f(window, out_window, input, output, op);
351 static void reduceY(
const Window &window,
const ITensor *input, ITensor *output, F f,
const ReductionOperation op)
354 Window in_window(window);
355 Window out_window(window);
357 in_window.set(
Window::DimY, Window::Dimension(0, 1, 1));
358 out_window.set(
Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1)));
360 f(in_window, out_window, input, output, 1, op);
362 static void reduceZ(
const Window &window,
const ITensor *input, ITensor *output, F f,
const ReductionOperation op)
365 Window in_window(window);
366 Window out_window(window);
368 in_window.set(
Window::DimZ, Window::Dimension(0, 1, 1));
369 out_window.set(
Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2)));
371 f(in_window, out_window, input, output, 2, op);
373 static void reduceW(
const Window &window,
const ITensor *input, ITensor *output, F f,
const ReductionOperation op)
376 Window in_window(window);
377 Window out_window(window);
379 in_window.set(3, Window::Dimension(0, 1, 1));
380 out_window.set(3, Window::Dimension(0, 1, 1));
382 f(in_window, out_window, input, output, 3, op);
386 template <
typename T,
int S>
390 using ExactTagType =
typename wrapper::traits::neon_vector<T, S>::tag_type;
392 inline void operator()(
const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
const ReductionOperation op)
394 const TensorInfo in_info = *(in->info());
395 const int window_step_x = 16 /
sizeof(T);
396 const auto window_start_x =
static_cast<int>(in_window.x().start());
397 const auto window_end_x =
static_cast<int>(in_window.x().end());
399 Window in_win_no_pad = in_window;
400 in_win_no_pad.set(
Window::DimX, Window::Dimension(0, 1, 1));
402 Iterator
input(in, in_win_no_pad);
403 Iterator output(out, out_window);
407 const auto input_ptr =
reinterpret_cast<const T *
>(input.ptr());
409 auto init_res_value =
static_cast<T
>(0.f);
417 init_res_value =
static_cast<T
>(*input_ptr);
422 init_res_value =
static_cast<T
>(1.f);
429 uint32x4x4_t vec_res_idx{ { 0 } };
432 int x = window_start_x;
433 for(; x <= (window_end_x - window_step_x); x += window_step_x)
450 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
451 vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
452 vec_res_value = temp_vec_res_value;
457 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
458 vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
459 vec_res_value = temp_vec_res_value;
484 for(
int i = 0; i < S / 4; ++i)
493 for(; x < window_end_x; ++x)
495 res += (*(input_ptr + x)) * (*(input_ptr + x));
501 for(; x < window_end_x; ++x)
503 res += *(input_ptr + x);
509 res /= in_info.dimension(0);
512 *(
reinterpret_cast<T *
>(output.ptr())) = res;
519 for(
int i = 0; i < S / 2; ++i)
525 for(; x < window_end_x; ++x)
527 res *= *(input_ptr + x);
530 *(
reinterpret_cast<T *
>(output.ptr())) = res;
535 auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
539 for(; x < window_end_x; ++x)
541 if(*(input_ptr + x) < res)
544 res = *(input_ptr + x);
547 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
552 auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
556 for(; x < window_end_x; ++x)
558 if(*(input_ptr + x) > res)
561 res = *(input_ptr + x);
564 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
572 for(; x < window_end_x; ++x)
574 res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
576 *(
reinterpret_cast<T *
>(output.ptr())) = res;
584 for(; x < window_end_x; ++x)
586 res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
588 *(
reinterpret_cast<T *
>(output.ptr())) = res;
599 template <
typename T>
600 struct RedOpX_quantized
602 inline void operator()(
const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
const ReductionOperation op)
606 const TensorInfo in_info = *(in->info());
607 const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
609 const int window_step_x = 16 /
sizeof(T);
610 const auto window_start_x =
static_cast<int>(in_window.x().start());
611 const auto window_end_x =
static_cast<int>(in_window.x().end());
613 Window in_win_no_pad = in_window;
614 in_win_no_pad.set(
Window::DimX, Window::Dimension(0, 1, 1));
616 Iterator
input(in, in_win_no_pad);
617 Iterator output(out, out_window);
621 const auto input_ptr =
reinterpret_cast<T *
>(input.ptr());
623 auto vec_res_value1 =
wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
624 auto vec_res_value2 =
wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
625 auto vec_res_value3 =
wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
626 auto vec_res_value4 =
wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
628 auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
629 auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
630 auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
631 auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
637 vec_res_value =
wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
640 uint32x4x4_t vec_res_idx{ { 0 } };
642 int x = window_start_x;
643 for(; x <= (window_end_x - window_step_x); x += window_step_x)
667 const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
668 const auto scale32x4f_4 = vdupq_n_f32(iq_info.scale);
678 auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
679 auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
680 auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
681 auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
684 temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
685 temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
686 temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
687 temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
689 vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
690 vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
691 vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
692 vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
697 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
698 vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
699 vec_res_value = temp_vec_res_value;
704 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
705 vec_res_idx = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
706 vec_res_value = temp_vec_res_value;
728 auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
732 for(; x < window_end_x; ++x)
734 if(*(input_ptr + x) < res)
737 res = *(input_ptr + x);
740 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
745 auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
749 for(; x < window_end_x; ++x)
751 if(*(input_ptr + x) > res)
754 res = *(input_ptr + x);
757 *(
reinterpret_cast<uint32_t *
>(output.ptr())) = idx;
765 for(; x < window_end_x; ++x)
767 res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
769 *(
reinterpret_cast<T *
>(output.ptr())) = res;
777 for(; x < window_end_x; ++x)
779 res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
781 *(
reinterpret_cast<T *
>(output.ptr())) = res;
786 auto carry_res =
wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
796 for(; x < window_end_x; ++x)
799 if(std::is_same<T, uint8_t>::value)
810 if(std::is_same<T, uint8_t>::value)
819 *
reinterpret_cast<T *
>(output.ptr()) = static_cast<T>(res);
825 auto carry_res =
wrapper::vadd(vec_res_value1, vec_res_value2);
830 carry_paddition =
wrapper::vpadd(carry_paddition, carry_paddition);
834 for(; x < window_end_x; ++x)
836 res += *(input_ptr + x);
841 res /=
static_cast<int32_t
>(in_info.dimension(0));
846 res -= (in_info.dimension(0) - 1) * iq_info.offset;
848 *
reinterpret_cast<T *
>(output.ptr()) = utils::cast::saturate_cast<T>(res);
859 template <
typename T,
int S>
863 using ExactTagType =
typename wrapper::traits::neon_vector<T, S>::tag_type;
866 inline void operator()(
const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
int axis,
const ReductionOperation op)
868 const TensorInfo in_info = *(in->info());
869 const int window_step_x = 16 /
sizeof(T);
870 const auto window_start_x_tmp =
static_cast<int>(in_window.x().start());
871 const auto window_end_x_tmp =
static_cast<int>(in_window.x().end());
873 const auto window_start_x =
static_cast<int>(0);
874 const auto window_end_x =
static_cast<int>(in_window.shape().x());
876 Window in_win_no_pad = in_window;
877 in_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
878 Window out_win_no_pad = out_window;
879 out_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
881 Iterator
input(in, in_win_no_pad);
882 Iterator output(out, out_win_no_pad);
886 const auto input_ptr =
reinterpret_cast<T *
>(input.ptr());
889 int x = window_start_x;
890 for(; x <= (window_end_x - window_step_x); x += window_step_x)
892 neon_vector vec_res_value = { 0 };
914 uint32x4x4_t vec_res_idx{ { 0 } };
916 for(
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
918 const T *in_ptr =
reinterpret_cast<T *
>(input.ptr() + x *
sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
934 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
935 vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
936 vec_res_value = temp_vec_res_value;
941 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
942 vec_res_idx = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
943 vec_res_value = temp_vec_res_value;
969 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
970 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 971 if(std::is_same<T, float16_t>::value)
973 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
975 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 979 wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x *
sizeof(T)), vec_res_value);
984 for(; x < window_end_x; ++x)
986 auto res_value = 0.f;
994 res_value = *(input_ptr + x);
999 res_value =
static_cast<T
>(1.f);
1004 res_value =
static_cast<T
>(0.f);
1009 uint32_t res_idx = 0;
1010 for(
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1012 const T *in_ptr =
reinterpret_cast<T *
>(input.ptr() + x *
sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
1018 res_value += *in_ptr;
1021 res_value += *in_ptr * *in_ptr;
1024 res_value *= *in_ptr;
1028 if(*in_ptr < res_value)
1030 res_value = *in_ptr;
1037 if(*in_ptr > res_value)
1039 res_value = *in_ptr;
1046 res_value = *in_ptr < res_value ? *in_ptr : res_value;
1051 res_value = *in_ptr > res_value ? *in_ptr : res_value;
1061 res_value /= in_info.dimension(axis);
1066 *(
reinterpret_cast<uint32_t *
>(output.ptr()) + x) = res_idx;
1070 *(
reinterpret_cast<T *
>(output.ptr() + x *
sizeof(T))) = res_value;
1078 template <
typename T,
int S,
int axis, ReductionOperation op>
1079 struct RedOpYZW_complex
1082 using ExactTagType =
typename wrapper::traits::neon_vector<T, S>::tag_type;
1085 inline void operator()(
const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
int,
const ReductionOperation)
1090 const TensorInfo in_info = *(in->info());
1091 const size_t stride_z = in_info.strides_in_bytes()[axis];
1092 const int window_step_x = 16 /
sizeof(T);
1093 const auto window_start_x_tmp =
static_cast<int>(in_window.x().start());
1094 const auto window_end_x_tmp =
static_cast<int>(in_window.x().end());
1096 const auto window_start_x =
static_cast<int>(0);
1097 const auto window_end_x =
static_cast<int>(in_window.shape().x());
1099 Window in_win_no_pad = in_window;
1100 in_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
1101 Window out_win_no_pad = out_window;
1102 out_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
1104 Iterator
input(in, in_win_no_pad);
1105 Iterator output(out, out_win_no_pad);
1110 int x = window_start_x;
1111 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1113 neon_vector vec_res_value_0 = { 0 };
1114 neon_vector vec_res_value_1 = { 0 };
1116 vec_res_value_0 =
wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
1117 vec_res_value_1 =
wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
1119 T *out_ptr =
reinterpret_cast<T *
>(output.ptr() + 2 * x *
sizeof(T));
1120 for(
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1122 T *in_ptr_0 =
reinterpret_cast<T *
>(input.ptr() + 2 * x *
sizeof(T) + stride_z * dim);
1123 T *in_ptr_1 =
reinterpret_cast<T *
>(input.ptr() + 2 * x *
sizeof(T) + 16 + stride_z * dim);
1128 vec_res_value_0 =
wrapper::vadd(vec_elements_0, vec_res_value_0);
1129 vec_res_value_1 =
wrapper::vadd(vec_elements_1, vec_res_value_1);
1137 for(; x < window_end_x; ++x)
1139 auto res_value_0 = 0.f;
1140 auto res_value_1 = 0.f;
1142 T *out_ptr =
reinterpret_cast<T *
>(output.ptr() + 2 * x *
sizeof(T));
1143 for(
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1145 T *in_ptr =
reinterpret_cast<T *
>(input.ptr() + 2 * x *
sizeof(T) + stride_z * dim);
1146 res_value_0 += *in_ptr;
1147 res_value_1 += *(in_ptr + 1);
1149 *out_ptr = res_value_0;
1150 *(out_ptr + 1) = res_value_1;
1157 template <
typename T>
1158 struct RedOpYZW_quantized
1160 inline void operator()(
const Window &in_window, Window &out_window,
const ITensor *in, ITensor *out,
int axis,
const ReductionOperation op)
1162 const TensorInfo in_info = *(in->info());
1163 const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
1166 const int window_step_x = 16 /
sizeof(T);
1167 const auto window_start_x_tmp =
static_cast<int>(in_window.x().start());
1168 const auto window_end_x_tmp =
static_cast<int>(in_window.x().end());
1170 const auto window_start_x =
static_cast<int>(0);
1171 const auto window_end_x =
static_cast<int>(in_window.shape().x());
1173 Window in_win_no_pad = in_window;
1174 in_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
1175 Window out_win_no_pad = out_window;
1176 out_win_no_pad.set(
Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
1178 Iterator
input(in, in_win_no_pad);
1179 Iterator output(out, out_win_no_pad);
1183 const auto input_ptr =
reinterpret_cast<T *
>(input.ptr());
1186 int x = window_start_x;
1187 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1189 uint32x4x4_t vec_res_idx{ { 0 } };
1190 auto vec_res_value1 =
wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
1191 auto vec_res_value2 =
wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
1192 auto vec_res_value3 =
wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
1193 auto vec_res_value4 =
wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
1195 auto vec_res_value1_f =
wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
1196 auto vec_res_value2_f =
wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
1197 auto vec_res_value3_f =
wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
1198 auto vec_res_value4_f =
wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
1202 for(
unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
1204 const T *in_ptr = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
1219 vec_res_value1 =
wrapper::vadd(temp32x4t_1, vec_res_value1);
1220 vec_res_value2 =
wrapper::vadd(temp32x4t_2, vec_res_value2);
1221 vec_res_value3 =
wrapper::vadd(temp32x4t_3, vec_res_value3);
1222 vec_res_value4 =
wrapper::vadd(temp32x4t_4, vec_res_value4);
1227 const auto offset32x4f_4 =
wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
1228 const auto scale32x4f_4 =
wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
1238 auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
1239 auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
1240 auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
1241 auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
1249 vec_res_value1_f =
wrapper::vmul(temp32x4f_1, vec_res_value1_f);
1250 vec_res_value2_f =
wrapper::vmul(temp32x4f_2, vec_res_value2_f);
1251 vec_res_value3_f =
wrapper::vmul(temp32x4f_3, vec_res_value3_f);
1252 vec_res_value4_f =
wrapper::vmul(temp32x4f_4, vec_res_value4_f);
1257 auto temp_vec_res_value =
wrapper::vmin(vec_elements, vec_res_value);
1258 vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
1259 vec_res_value = temp_vec_res_value;
1264 auto temp_vec_res_value =
wrapper::vmax(vec_elements, vec_res_value);
1265 vec_res_idx = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
1266 vec_res_value = temp_vec_res_value;
1289 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
1290 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
1291 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
1292 wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]);
1298 wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
1304 auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
1311 vec_res_s_value1 =
wrapper::vsub(vec_res_s_value1, offsets);
1312 vec_res_s_value2 =
wrapper::vsub(vec_res_s_value2, offsets);
1313 vec_res_s_value3 =
wrapper::vsub(vec_res_s_value3, offsets);
1314 vec_res_s_value4 =
wrapper::vsub(vec_res_s_value4, offsets);
1319 combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
1324 const auto vec_width_inv =
wrapper::vinv(
wrapper::vdup_n(static_cast<float>(in_info.dimension(axis)), wrapper::traits::vector_128_tag{}));
1325 vec_res_value1_f =
wrapper::vmul(wrapper::vcvt<float>(vec_res_value1), vec_width_inv);
1326 vec_res_value2_f =
wrapper::vmul(wrapper::vcvt<float>(vec_res_value2), vec_width_inv);
1327 vec_res_value3_f =
wrapper::vmul(wrapper::vcvt<float>(vec_res_value3), vec_width_inv);
1328 vec_res_value4_f =
wrapper::vmul(wrapper::vcvt<float>(vec_res_value4), vec_width_inv);
1330 vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
1331 vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
1332 vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
1333 vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
1344 const auto offset32x4f_4 =
wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
1345 const auto iscale32x4f_4 =
vinvq_f32(vdupq_n_f32(iq_info.scale));
1353 vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
1354 vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
1355 vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
1356 vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
1371 for(; x < window_end_x; ++x)
1373 float res_value = 0.f;
1381 res_value = *(input_ptr + x);
1386 res_value =
static_cast<T
>(1.0f);
1391 res_value =
static_cast<T
>(0.0f);
1395 uint32_t res_idx = 0;
1397 for(
unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
1399 const T *in_ptr =
reinterpret_cast<T *
>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
1405 res_value += *in_ptr;
1410 res_value += *in_ptr * *in_ptr;
1416 if(std::is_same<T, uint8_t>::value)
1428 if(*in_ptr < res_value)
1430 res_value = *in_ptr;
1437 if(*in_ptr > res_value)
1439 res_value = *in_ptr;
1446 res_value = *in_ptr < res_value ? *in_ptr : res_value;
1451 res_value = *in_ptr > res_value ? *in_ptr : res_value;
1463 int32_t res =
static_cast<int32_t
>(res_value);
1464 res /=
static_cast<int32_t
>(in_info.dimension(axis));
1465 *
reinterpret_cast<T *
>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
1471 res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
1472 *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
1479 if(std::is_same<T, uint8_t>::value)
1487 *(
reinterpret_cast<T *
>(output.ptr() + x)) = res;
1493 *(
reinterpret_cast<uint32_t *
>(output.ptr() + x * 4)) = res_idx;
1497 *(
reinterpret_cast<T *
>(output.ptr() + x)) = res_value;
1505 void reduce_op(
const Window &window,
const ITensor *input, ITensor *output,
unsigned int axis,
const ReductionOperation op)
1507 const bool is_complex = (input->info()->num_channels() == 2);
1514 switch(input->info()->data_type())
1520 return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
1535 switch(input->info()->data_type())
1538 return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
1540 return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
1541 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1543 return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
1544 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1546 return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
1548 return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op);
1553 switch(input->info()->data_type())
1556 return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
1558 return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
1559 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1561 return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
1562 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1564 return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
1566 return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, RedOpYZW<int32_t, 4>(), op);
1571 switch(input->info()->data_type())
1574 return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
1576 return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op);
1577 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1579 return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
1580 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1582 return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
1584 return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, RedOpYZW<int32_t, 4>(), op);
1589 switch(input->info()->data_type())
1592 return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
1594 return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op);
1595 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1597 return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
1598 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1600 return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
1602 return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, RedOpYZW<int32_t, 4>(), op);
1618 if(input->num_channels() == 1)
1632 if(output->total_size() != 0)
1647 const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
1669 _reduction_axis = axis;
1676 INEKernel::configure(win);
1683 auto_init_if_empty(*output->
info(), input->
info()->
clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(
true));
1700 reduce_op(window, _input, _output, _reduction_axis, _op);
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of an Image.
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
uint8x8_t vorr(const uint8x8_t &a, const uint8x8_t &b)
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
float dequantize_qasymm8(uint8_t value, const INFO_TYPE &qinfo)
Dequantize a value given an unsigned 8-bit asymmetric quantization scheme.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...)
ReductionOperation
Available reduction operations.
uint8_t quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given an unsigned 8-bit asymmetric quantization scheme.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
uint8x16_t vloadq(const uint8_t *ptr)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Store the tensor's metadata.
float32x2_t vinv(const float32x2_t &a)
#define ARM_COMPUTE_ERROR_THROW_ON(status)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
NEReductionOperationKernel()
Default constructor.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
decltype(strategy::transforms) typedef type
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
virtual void set_valid_region(const ValidRegion &valid_region)=0
Set the valid region of the tensor.
1 channel, 1 F16 per channel
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
1 channel, 1 S32 per channel
uint32x2_t vqmovn(const uint64x2_t &a)
static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
Static function to check if given info will lead to a valid configuration of NEReductionOperationKernel.
uint8x8_t vpadd(const uint8x8_t &a, const uint8x8_t &b)
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
1 channel, 1 U32 per channel
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
int8_t quantize_qasymm8_signed(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given a signed 8-bit asymmetric quantization scheme.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
int16x4_t vreinterpret(const uint16x4_t &a)
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vand(const uint8x8_t &a, const uint8x8_t &b)
TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis, bool keep_dims=true)
Calculate the reduced shape of a tensor given an axis.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
uint8x8_t vgetlow(const uint8x16_t val)
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
void configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
Set the source and destination of the kernel.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
uint8x8_t vgethigh(const uint8x16_t val)
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
uint8x8_t vcgt(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vbsl(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Information about executing thread and CPU.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
float32x4_t vinvq_f32(float32x4_t x)
Calculate reciprocal.
void vstore(uint8_t *ptr, uint8x8_t val)
uint8x8_t vclt(const uint8x8_t &a, const uint8x8_t &b)
float dequantize_qasymm8_signed(int8_t value, const INFO_TYPE &qinfo)
Dequantize a value given a signed 8-bit asymmetric quantization scheme.
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
void set_num_dimensions(size_t num_dimensions)
Set number of dimensions.
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
uint8x8_t vpmax(const uint8x8_t &a, const uint8x8_t &b)
Container for valid region of a window.
uint8x8_t vpmin(const uint8x8_t &a, const uint8x8_t &b)
static constexpr size_t num_max_dimensions
Number of dimensions the tensor has.
DataType
Available data types.
uint16x8_t vmovl(const uint8x8_t &a)
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
Describe a multidimensional execution window.
uint8x8_t vceq(const uint8x8_t &a, const uint8x8_t &b)
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
uint32x2_t vqmovun(const int64x2_t &a)