Compute Library 21.02
NEReductionOperationKernel.cpp
/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEReductionOperationKernel.h"

#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/INEKernel.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/SaturateCast.h"

#include <arm_neon.h>

namespace arm_compute
{
namespace
{
// Helper function that calls vqmovun/vqmovn, vcombine and vstore, allows templating of RedOpYZW_quantized
template <typename T>
void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0)
{
    if(std::is_same<T, uint8_t>::value)
    {
        auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2));
        wrapper::vstore(output.ptr() + offset, res);
    }
    else
    {
        auto res = wrapper::vcombine(wrapper::vqmovn(t1), wrapper::vqmovn(t2));
        wrapper::vstore(reinterpret_cast<int8_t *>(output.ptr() + offset), res);
    }
}

template <typename T>
uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
{
    uint32x4_t mask{ 0 };
    if(op == ReductionOperation::ARG_IDX_MIN)
    {
        mask = wrapper::vcgt(b, a);
    }
    else
    {
        mask = wrapper::vclt(b, a);
    }

    uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
    if(axis != 0)
    {
        vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
    }
    uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } };

    return res;
}

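// calculate_index_quantized mirrors calculate_index for 16-lane 8-bit vectors: the 8-bit
// comparison mask is widened to four 32-bit masks (via the vshll/vmovl/vorr sequence below)
// so that each of the sixteen lanes can select between its running index and the new one.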
template <typename T>
uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
{
    uint32x4x4_t mask{ { 0 } };
    uint8x16_t   mask_u8{ 0 };
    if(op == ReductionOperation::ARG_IDX_MIN)
    {
        mask_u8 = wrapper::vcgt(b, a);
    }
    else
    {
        mask_u8 = wrapper::vclt(b, a);
    }
    auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
    auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
    mask.val[0]     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
    mask.val[1]     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
    mask.val[2]     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
    mask.val[3]     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));

    uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
                               { idx + 4, idx + 5, idx + 6, idx + 7 },
                               { idx + 8, idx + 9, idx + 10, idx + 11 },
                               { idx + 12, idx + 13, idx + 14, idx + 15 }
                             }
                           };
    if(axis != 0)
    {
        vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
        vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
        vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
        vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
    }
    uint32x4x4_t res =
    {
        {
            vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
            vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
            vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
            vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
        }
    };

    return res;
}

// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
template <typename T>
inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
       typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
       calculate_min(T in)
{
    auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
    return wrapper::vpmin(pmin, pmin);
}

// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
template <typename T>
inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
       typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
       calculate_min(T in)
{
    auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
    pmin      = wrapper::vpmin(pmin, pmin);
    pmin      = wrapper::vpmin(pmin, pmin);
    return wrapper::vpmin(pmin, pmin);
}

// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
template <typename T>
inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
       typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
       calculate_max(T in)
{
    auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
    return wrapper::vpmax(pmax, pmax);
}

// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
template <typename T>
inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
       typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
       calculate_max(T in)
{
    auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
    pmax      = wrapper::vpmax(pmax, pmax);
    pmax      = wrapper::vpmax(pmax, pmax);
    return wrapper::vpmax(pmax, pmax);
}

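// calculate_vector_index extracts the scalar arg-min/arg-max index from the per-lane
// candidates: lanes whose value equals the vector-wide min/max keep their index, all other
// lanes become 0. Adding 0xFFFFFFFF biases non-matching lanes to the largest uint32 (matching
// lanes become idx - 1), so a pairwise minimum selects the smallest matching index; the final
// subtraction of 0xFFFFFFFF (equivalent to adding 1) removes the bias again.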
template <typename T>
uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
{
    uint32x4_t res_idx_mask{ 0 };
    uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);

    if(op == ReductionOperation::ARG_IDX_MIN)
    {
        auto pmin    = calculate_min(vec_res_value);
        auto mask    = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
        res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
    }
    else
    {
        auto pmax    = calculate_max(vec_res_value);
        auto mask    = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
        res_idx_mask = wrapper::vand(vec_res_idx.val[0], mask);
    }

    res_idx_mask = wrapper::vadd(res_idx_mask, mask_ones);
    auto pmin    = wrapper::vpmin(wrapper::vgethigh(res_idx_mask), wrapper::vgetlow(res_idx_mask));
    pmin         = wrapper::vpmin(pmin, pmin);
    uint32_t res = wrapper::vgetlane(pmin, 0);

    return (res - 0xFFFFFFFF);
}

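// calculate_vector_index_quantized applies the same bias trick across all four index
// registers and takes the minimum over all sixteen lanes.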
template <typename T>
uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
{
    uint32x4x4_t res_idx_mask{ { 0 } };
    uint32x4_t   mask_ones = vdupq_n_u32(0xFFFFFFFF);
    uint8x16_t   mask_u8{ 0 };
    if(op == ReductionOperation::ARG_IDX_MIN)
    {
        auto pmin = calculate_min(vec_res_value);
        mask_u8   = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
    }
    else
    {
        auto pmax = calculate_max(vec_res_value);
        mask_u8   = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
    }

    // Widen vectors
    auto wide_u16_1     = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
    auto wide_u16_2     = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
    auto wide_u32_1     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
    auto wide_u32_2     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
    auto wide_u32_3     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
    auto wide_u32_4     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
    res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
    res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
    res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
    res_idx_mask.val[3] = wrapper::vand(vec_res_idx.val[3], wide_u32_4);
    res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
    res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);
    res_idx_mask.val[2] = wrapper::vadd(res_idx_mask.val[2], mask_ones);
    res_idx_mask.val[3] = wrapper::vadd(res_idx_mask.val[3], mask_ones);

    uint32_t res  = 0xFFFFFFFF;
    int      iter = 0;
    do
    {
        auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
        pmin      = wrapper::vpmin(pmin, pmin);
        res       = std::min(wrapper::vgetlane(pmin, 0), res);
        iter++;
    }
    while(iter < 4);

    return (res - 0xFFFFFFFF);
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
{
    uint32x4x2_t mask{ 0 };
    uint16x8_t   mask_u16{ 0 };
    if(op == ReductionOperation::ARG_IDX_MIN)
    {
        mask_u16 = wrapper::vcgt(b, a);
    }
    else
    {
        mask_u16 = wrapper::vclt(b, a);
    }
    mask.val[0]          = wrapper::vmovl(wrapper::vgetlow(mask_u16));
    mask.val[1]          = wrapper::vmovl(wrapper::vgethigh(mask_u16));
    uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
                               { idx + 4, idx + 5, idx + 6, idx + 7 }
                             }
                           };
    if(axis != 0)
    {
        vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
        vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
    }
    uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
                         wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]),
                         0, 0
                       };

    return res;
}

// Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
inline float16x4_t calculate_min(float16x8_t in)
{
    auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
    pmin      = wrapper::vpmin(pmin, pmin);
    return wrapper::vpmin(pmin, pmin);
}
// Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
inline float16x4_t calculate_max(float16x8_t in)
{
    auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
    pmax      = wrapper::vpmax(pmax, pmax);
    return wrapper::vpmax(pmax, pmax);
}

template <>
uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
{
    uint32x4x2_t res_idx_mask{ 0 };
    uint32x4_t   mask_ones = vdupq_n_u32(0xFFFFFFFF);
    uint16x8_t   mask_u16;
    if(op == ReductionOperation::ARG_IDX_MIN)
    {
        auto pmin = calculate_min(vec_res_value);
        mask_u16  = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
    }
    else
    {
        auto pmax = calculate_max(vec_res_value);
        mask_u16  = wrapper::vceq(vec_res_value, wrapper::vcombine(pmax, pmax));
    }

    // Widen vectors
    auto wide_u32_1     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
    auto wide_u32_2     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
    res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
    res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
    res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
    res_idx_mask.val[1] = wrapper::vadd(res_idx_mask.val[1], mask_ones);

    uint32_t res  = 0xFFFFFFFF;
    int      iter = 0;
    do
    {
        auto pmin = wrapper::vpmin(wrapper::vgethigh(res_idx_mask.val[iter]), wrapper::vgetlow(res_idx_mask.val[iter]));
        pmin      = wrapper::vpmin(pmin, pmin);
        res       = std::min(wrapper::vgetlane(pmin, 0), res);
        iter++;
    }
    while(iter < 2);

    return (res - 0xFFFFFFFF);
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

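// Reducer dispatches a reduction functor F over the selected axis: it clamps the
// corresponding window dimension so the functor itself walks that axis, and leaves the
// remaining dimensions to execute_window_loop.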
template <class F>
class Reducer
{
public:
    static void reduceX(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
    {
        // Set out window
        Window out_window(window);
        out_window.set(Window::DimX, Window::Dimension(0, 1, 1));

        f(window, out_window, input, output, op);
    }
    static void reduceY(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
    {
        // Set in window
        Window in_window(window);
        Window out_window(window);

        in_window.set(Window::DimY, Window::Dimension(0, 1, 1));
        out_window.set(Window::DimY, Window::Dimension(0, output->info()->dimension(1), output->info()->dimension(1)));

        f(in_window, out_window, input, output, 1, op);
    }
    static void reduceZ(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
    {
        // Set in window
        Window in_window(window);
        Window out_window(window);

        in_window.set(Window::DimZ, Window::Dimension(0, 1, 1));
        out_window.set(Window::DimZ, Window::Dimension(0, output->info()->dimension(2), output->info()->dimension(2)));

        f(in_window, out_window, input, output, 2, op);
    }
    static void reduceW(const Window &window, const ITensor *input, ITensor *output, F f, const ReductionOperation op)
    {
        // Set in/out window
        Window in_window(window);
        Window out_window(window);

        in_window.set(3, Window::Dimension(0, 1, 1));
        out_window.set(3, Window::Dimension(0, 1, 1));

        f(in_window, out_window, input, output, 3, op);
    }
};

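// RedOpX: reduction along the X axis for non-quantized types. Each row is reduced
// vector-wise (window_step_x elements per iteration) and the tail is handled element
// by element before a single scalar result is stored.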
template <typename T, int S>
struct RedOpX
{
    /** Neon vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;

    inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
    {
        const TensorInfo in_info        = *(in->info());
        const int        window_step_x  = 16 / sizeof(T);
        const auto       window_start_x = static_cast<int>(in_window.x().start());
        const auto       window_end_x   = static_cast<int>(in_window.x().end());

        Window in_win_no_pad = in_window;
        in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input(in, in_win_no_pad);
        Iterator output(out, out_window);

        execute_window_loop(in_win_no_pad, [&](const Coordinates &)
        {
            const auto input_ptr = reinterpret_cast<const T *>(input.ptr());

            auto init_res_value = static_cast<T>(0.f);
            switch(op)
            {
                case ReductionOperation::ARG_IDX_MAX:
                case ReductionOperation::ARG_IDX_MIN:
                case ReductionOperation::MIN:
                case ReductionOperation::MAX:
                {
                    init_res_value = static_cast<T>(*input_ptr);
                    break;
                }
                case ReductionOperation::PROD:
                {
                    init_res_value = static_cast<T>(1.f);
                    break;
                }
                default:
                    break;
            }
            auto vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
            uint32x4x4_t vec_res_idx{ { 0 } };

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vec_elements = wrapper::vloadq(input_ptr + x);
                switch(op)
                {
                    case ReductionOperation::SUM_SQUARE:
                        vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
                        break;
                    case ReductionOperation::MEAN_SUM:
                    case ReductionOperation::SUM:
                        vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
                        break;
                    case ReductionOperation::PROD:
                        vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
                        break;
                    case ReductionOperation::ARG_IDX_MIN:
                    {
                        auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
                        vec_res_idx             = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
                        vec_res_value           = temp_vec_res_value;
                        break;
                    }
                    case ReductionOperation::ARG_IDX_MAX:
                    {
                        auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
                        vec_res_idx             = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
                        vec_res_value           = temp_vec_res_value;
                        break;
                    }
                    case ReductionOperation::MIN:
                    {
                        vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
                        break;
                    }
                    case ReductionOperation::MAX:
                    {
                        vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
                        break;
                    }
                    default:
                        ARM_COMPUTE_ERROR("Not supported");
                }
            }

            switch(op)
            {
                case ReductionOperation::SUM:
                case ReductionOperation::SUM_SQUARE:
                case ReductionOperation::MEAN_SUM:
                {
                    auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
                    for(int i = 0; i < S / 4; ++i)
                    {
                        carry_res = wrapper::vpadd(carry_res, carry_res);
                    }
                    auto res = wrapper::vgetlane(carry_res, 0);

                    if(op == ReductionOperation::SUM_SQUARE)
                    {
                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            res += (*(input_ptr + x)) * (*(input_ptr + x));
                        }
                    }
                    else
                    {
                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            res += *(input_ptr + x);
                        }
                    }

                    if(op == ReductionOperation::MEAN_SUM)
                    {
                        res /= in_info.dimension(0);
                    }

                    *(reinterpret_cast<T *>(output.ptr())) = res;
                    break;
                }
                case ReductionOperation::PROD:
                {
                    auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
                    T    res       = 1;
                    for(int i = 0; i < S / 2; ++i)
                    {
                        res *= wrapper::vgetlane(carry_res, i);
                    }

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        res *= *(input_ptr + x);
                    }

                    *(reinterpret_cast<T *>(output.ptr())) = res;
                    break;
                }
                case ReductionOperation::ARG_IDX_MIN:
                {
                    auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
                    auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        if(*(input_ptr + x) < res)
                        {
                            idx = x;
                            res = *(input_ptr + x);
                        }
                    }
                    *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
                    break;
                }
                case ReductionOperation::ARG_IDX_MAX:
                {
                    auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
                    auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        if(*(input_ptr + x) > res)
                        {
                            idx = x;
                            res = *(input_ptr + x);
                        }
                    }
                    *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
                    break;
                }
                case ReductionOperation::MIN:
                {
                    auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
                    }
                    *(reinterpret_cast<T *>(output.ptr())) = res;
                    break;
                }
                case ReductionOperation::MAX:
                {
                    auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
                    }
                    *(reinterpret_cast<T *>(output.ptr())) = res;
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Not supported");
            }
        },
        input, output);
    }
};

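// RedOpX_quantized: X-axis reduction for QASYMM8/QASYMM8_SIGNED. Sums are widened into
// four 32-bit accumulators; PROD dequantizes to float, multiplies, and re-quantizes the
// scalar result.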
template <typename T>
struct RedOpX_quantized
{
    inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
    {
        using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;

        const TensorInfo              in_info = *(in->info());
        const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();

        const int  window_step_x  = 16 / sizeof(T);
        const auto window_start_x = static_cast<int>(in_window.x().start());
        const auto window_end_x   = static_cast<int>(in_window.x().end());

        Window in_win_no_pad = in_window;
        in_win_no_pad.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input(in, in_win_no_pad);
        Iterator output(out, out_window);

        execute_window_loop(in_win_no_pad, [&](const Coordinates &)
        {
            const auto input_ptr = reinterpret_cast<T *>(input.ptr());

            auto vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
            auto vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
            auto vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
            auto vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});

            auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
            auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
            auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
            auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));

            typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = { 0 };

            if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX)
            {
                vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
            }

            uint32x4x4_t vec_res_idx{ { 0 } };
            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vec_elements = wrapper::vloadq(input_ptr + x);
                switch(op)
                {
                    case ReductionOperation::SUM:
                    case ReductionOperation::MEAN_SUM:
                    {
                        const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
                        const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));

                        const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
                        const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
                        const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
                        const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));

                        vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
                        vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
                        vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
                        vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
                        break;
                    }
                    case ReductionOperation::PROD:
                    {
                        const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
                        const auto scale32x4f_4  = vdupq_n_f32(iq_info.scale);

                        const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
                        const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));

                        const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
                        const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
                        const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
                        const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));

                        auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
                        auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
                        auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
                        auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);

                        //de-quantize vec_elements
                        temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
                        temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
                        temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
                        temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);

                        vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
                        vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
                        vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
                        vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
                        break;
                    }
                    case ReductionOperation::ARG_IDX_MIN:
                    {
                        auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
                        vec_res_idx             = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
                        vec_res_value           = temp_vec_res_value;
                        break;
                    }
                    case ReductionOperation::ARG_IDX_MAX:
                    {
                        auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
                        vec_res_idx             = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
                        vec_res_value           = temp_vec_res_value;
                        break;
                    }
                    case ReductionOperation::MIN:
                    {
                        vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
                        break;
                    }
                    case ReductionOperation::MAX:
                    {
                        vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
                        break;
                    }
                    default:
                        ARM_COMPUTE_ERROR("Not supported");
                }
            }

            switch(op)
            {
                case ReductionOperation::ARG_IDX_MIN:
                {
                    auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
                    auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        if(*(input_ptr + x) < res)
                        {
                            idx = x;
                            res = *(input_ptr + x);
                        }
                    }
                    *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
                    break;
                }
                case ReductionOperation::ARG_IDX_MAX:
                {
                    auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
                    auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        if(*(input_ptr + x) > res)
                        {
                            idx = x;
                            res = *(input_ptr + x);
                        }
                    }
                    *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
                    break;
                }
                case ReductionOperation::MIN:
                {
                    auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
                    }
                    *(reinterpret_cast<T *>(output.ptr())) = res;
                    break;
                }
                case ReductionOperation::MAX:
                {
                    auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
                    }
                    *(reinterpret_cast<T *>(output.ptr())) = res;
                    break;
                }
                case ReductionOperation::PROD:
                {
                    auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
                    carry_res      = wrapper::vmul(carry_res, vec_res_value3_f);
                    carry_res      = wrapper::vmul(carry_res, vec_res_value4_f);

                    float res = wrapper::vgetlane(carry_res, 0);
                    res *= wrapper::vgetlane(carry_res, 1);
                    res *= wrapper::vgetlane(carry_res, 2);
                    res *= wrapper::vgetlane(carry_res, 3);

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        //de-quantize input
                        if(std::is_same<T, uint8_t>::value)
                        {
                            res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
                        }
                        else
                        {
                            res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
                        }
                    }

                    //re-quantize result
                    if(std::is_same<T, uint8_t>::value)
                    {
                        res = quantize_qasymm8(res, iq_info);
                    }
                    else
                    {
                        res = quantize_qasymm8_signed(res, iq_info);
                    }

                    *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
                    break;
                }
                case ReductionOperation::SUM:
                case ReductionOperation::MEAN_SUM:
                {
                    auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
                    carry_res      = wrapper::vadd(carry_res, vec_res_value3);
                    carry_res      = wrapper::vadd(carry_res, vec_res_value4);

                    auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
                    carry_paddition      = wrapper::vpadd(carry_paddition, carry_paddition);
                    auto res             = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));

                    // Compute left-over elements
                    for(; x < window_end_x; ++x)
                    {
                        res += *(input_ptr + x);
                    }

                    if(op == ReductionOperation::MEAN_SUM)
                    {
                        res /= static_cast<int32_t>(in_info.dimension(0));
                    }
                    else
                    {
                        // Subtract accumulated offsets
                        res -= (in_info.dimension(0) - 1) * iq_info.offset;
                    }
                    *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Not supported");
            }
        },
        input, output);
    }
};

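// RedOpYZW: reduction along Y, Z or W for non-quantized types. The functor walks the
// reduced axis via the input strides while the window iterates over the remaining
// dimensions.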
template <typename T, int S>
struct RedOpYZW
{
    /** Neon vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
    using neon_vector  = typename wrapper::traits::neon_vector<T, S>::type;

    inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
    {
        const TensorInfo in_info            = *(in->info());
        const int        window_step_x      = 16 / sizeof(T);
        const auto       window_start_x_tmp = static_cast<int>(in_window.x().start());
        const auto       window_end_x_tmp   = static_cast<int>(in_window.x().end());
        // As the window is split over the x-axis, set the correct start and end for the split window
        const auto window_start_x = static_cast<int>(0);
        const auto window_end_x   = static_cast<int>(in_window.shape().x());

        Window in_win_no_pad = in_window;
        in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
        Window out_win_no_pad = out_window;
        out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));

        Iterator input(in, in_win_no_pad);
        Iterator output(out, out_win_no_pad);

        execute_window_loop(in_win_no_pad, [&](const Coordinates &)
        {
            const auto input_ptr = reinterpret_cast<T *>(input.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                neon_vector vec_res_value = { 0 };
                switch(op)
                {
                    case ReductionOperation::ARG_IDX_MAX:
                    case ReductionOperation::ARG_IDX_MIN:
                    case ReductionOperation::MIN:
                    case ReductionOperation::MAX:
                    {
                        vec_res_value = wrapper::vloadq(input_ptr + x);
                        break;
                    }
                    case ReductionOperation::PROD:
                    {
                        vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
                        break;
                    }
                    default:
                    {
                        vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
                        break;
                    }
                }
                uint32x4x4_t vec_res_idx{ { 0 } };

                for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
                {
                    const T   *in_ptr       = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
                    const auto vec_elements = wrapper::vloadq(in_ptr);
                    switch(op)
                    {
                        case ReductionOperation::SUM:
                        case ReductionOperation::MEAN_SUM:
                            vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
                            break;
                        case ReductionOperation::SUM_SQUARE:
                            vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
                            break;
                        case ReductionOperation::PROD:
                            vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
                            break;
                        case ReductionOperation::ARG_IDX_MIN:
                        {
                            auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
                            vec_res_idx             = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
                            vec_res_value           = temp_vec_res_value;
                            break;
                        }
                        case ReductionOperation::ARG_IDX_MAX:
                        {
                            auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
                            vec_res_idx             = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
                            vec_res_value           = temp_vec_res_value;
                            break;
                        }
                        case ReductionOperation::MIN:
                        {
                            vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
                            break;
                        }
                        case ReductionOperation::MAX:
                        {
                            vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
                            break;
                        }
                        default:
                            ARM_COMPUTE_ERROR("Not supported");
                    }
                }

                if(op == ReductionOperation::MEAN_SUM)
                {
                    auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
                    vec_res_value      = wrapper::vmul(vec_res_value, vec_width_inv);
                }

                if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
                {
                    wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                    if(std::is_same<T, float16_t>::value)
                    {
                        wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
                    }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                }
                else
                {
                    wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value);
                }
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                auto res_value = 0.f;
                switch(op)
                {
                    case ReductionOperation::ARG_IDX_MAX:
                    case ReductionOperation::ARG_IDX_MIN:
                    case ReductionOperation::MIN:
                    case ReductionOperation::MAX:
                    {
                        res_value = *(input_ptr + x);
                        break;
                    }
                    case ReductionOperation::PROD:
                    {
                        res_value = static_cast<T>(1.f);
                        break;
                    }
                    default:
                    {
                        res_value = static_cast<T>(0.f);
                        break;
                    }
                }

                uint32_t res_idx = 0;
                for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
                {
                    const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);

                    switch(op)
                    {
                        case ReductionOperation::SUM:
                        case ReductionOperation::MEAN_SUM:
                            res_value += *in_ptr;
                            break;
                        case ReductionOperation::SUM_SQUARE:
                            res_value += *in_ptr * *in_ptr;
                            break;
                        case ReductionOperation::PROD:
                            res_value *= *in_ptr;
                            break;
                        case ReductionOperation::ARG_IDX_MIN:
                        {
                            if(*in_ptr < res_value)
                            {
                                res_value = *in_ptr;
                                res_idx   = dim;
                            }
                            break;
                        }
                        case ReductionOperation::ARG_IDX_MAX:
                        {
                            if(*in_ptr > res_value)
                            {
                                res_value = *in_ptr;
                                res_idx   = dim;
                            }
                            break;
                        }
                        case ReductionOperation::MIN:
                        {
                            res_value = *in_ptr < res_value ? *in_ptr : res_value;
                            break;
                        }
                        case ReductionOperation::MAX:
                        {
                            res_value = *in_ptr > res_value ? *in_ptr : res_value;
                            break;
                        }
                        default:
                            ARM_COMPUTE_ERROR("Not supported");
                    }
                }

                if(op == ReductionOperation::MEAN_SUM)
                {
                    res_value /= in_info.dimension(axis);
                }

                if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
                {
                    *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx;
                }
                else
                {
                    *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value;
                }
            }
        },
        input, output);
    }
};

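// RedOpYZW_complex: SUM reduction over Z for two-channel (complex) F32 tensors; the
// real and imaginary parts are accumulated in separate vector registers.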
template <typename T, int S, int axis, ReductionOperation op>
struct RedOpYZW_complex
{
    /** Neon vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
    using neon_vector  = typename wrapper::traits::neon_vector<T, S>::type;

    inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
    {
        ARM_COMPUTE_ERROR_ON(axis != 2);
        ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM);

        const TensorInfo in_info            = *(in->info());
        const size_t     stride_z           = in_info.strides_in_bytes()[axis];
        const int        window_step_x      = 16 / sizeof(T);
        const auto       window_start_x_tmp = static_cast<int>(in_window.x().start());
        const auto       window_end_x_tmp   = static_cast<int>(in_window.x().end());
        // As the window is split over the x-axis, set the correct start and end for the split window
        const auto window_start_x = static_cast<int>(0);
        const auto window_end_x   = static_cast<int>(in_window.shape().x());

        Window in_win_no_pad = in_window;
        in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
        Window out_win_no_pad = out_window;
        out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));

        Iterator input(in, in_win_no_pad);
        Iterator output(out, out_win_no_pad);

        execute_window_loop(in_win_no_pad, [&](const Coordinates &)
        {
            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                neon_vector vec_res_value_0 = { 0 };
                neon_vector vec_res_value_1 = { 0 };

                vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
                vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});

                T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
                for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
                {
                    T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
                    T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);

                    const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
                    const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);

                    vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
                    vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
                }

                wrapper::vstore(out_ptr, vec_res_value_0);
                wrapper::vstore(out_ptr + 4, vec_res_value_1);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                auto res_value_0 = 0.f;
                auto res_value_1 = 0.f;

                T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
                for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
                {
                    T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
                    res_value_0 += *in_ptr;
                    res_value_1 += *(in_ptr + 1);
                }
                *out_ptr       = res_value_0;
                *(out_ptr + 1) = res_value_1;
            }
        },
        input, output);
    }
};

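// RedOpYZW_quantized: Y/Z/W reduction for QASYMM8/QASYMM8_SIGNED, mirroring
// RedOpX_quantized but keeping all sixteen lanes live across the reduced axis.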
template <typename T>
struct RedOpYZW_quantized
{
    inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
    {
        const TensorInfo              in_info = *(in->info());
        const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
        using PromotedType                    = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;

        const int  window_step_x      = 16 / sizeof(T);
        const auto window_start_x_tmp = static_cast<int>(in_window.x().start());
        const auto window_end_x_tmp   = static_cast<int>(in_window.x().end());
        // As the window is split over the x-axis, set the correct start and end for the split window
        const auto window_start_x = static_cast<int>(0);
        const auto window_end_x   = static_cast<int>(in_window.shape().x());

        Window in_win_no_pad = in_window;
        in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
        Window out_win_no_pad = out_window;
        out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));

        Iterator input(in, in_win_no_pad);
        Iterator output(out, out_win_no_pad);

        execute_window_loop(in_win_no_pad, [&](const Coordinates &)
        {
            const auto input_ptr = reinterpret_cast<T *>(input.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                uint32x4x4_t vec_res_idx{ { 0 } };
                auto vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
                auto vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
                auto vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
                auto vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});

                auto vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
                auto vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
                auto vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
                auto vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});

                auto vec_res_value = wrapper::vloadq(input_ptr + x);

                for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
                {
                    const T   *in_ptr       = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
                    const auto vec_elements = wrapper::vloadq(in_ptr);
                    switch(op)
                    {
                        case ReductionOperation::SUM:
                        case ReductionOperation::MEAN_SUM:
                        {
                            const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
                            const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));

                            const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
                            const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
                            const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
                            const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));

                            vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
                            vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
                            vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
                            vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
                            break;
                        }
                        case ReductionOperation::PROD:
                        {
                            const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
                            const auto scale32x4f_4  = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});

                            const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
                            const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));

                            const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
                            const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
                            const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
                            const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));

                            auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
                            auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
                            auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
                            auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);

                            //de-quantize vec_elements
                            temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
                            temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
                            temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
                            temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);

                            vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
                            vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
                            vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
                            vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
                            break;
                        }
                        case ReductionOperation::ARG_IDX_MIN:
                        {
                            auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
                            vec_res_idx             = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
                            vec_res_value           = temp_vec_res_value;
                            break;
                        }
                        case ReductionOperation::ARG_IDX_MAX:
                        {
                            auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
                            vec_res_idx             = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
                            vec_res_value           = temp_vec_res_value;
                            break;
                        }
                        case ReductionOperation::MIN:
                        {
                            vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
                            break;
                        }
                        case ReductionOperation::MAX:
                        {
                            vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
                            break;
                        }
                        default:
                            ARM_COMPUTE_ERROR("Not supported");
                    }
                }

                switch(op)
                {
                    case ReductionOperation::ARG_IDX_MIN:
                    case ReductionOperation::ARG_IDX_MAX:
                    {
                        wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
                        wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
                        wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
                        wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]);
                        break;
                    }
                    case ReductionOperation::MIN:
                    case ReductionOperation::MAX:
                    {
                        wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
                        break;
                    }
                    case ReductionOperation::SUM:
                    {
                        // Subtract offsets
                        auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);

                        auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
                        auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
                        auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
                        auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);

                        vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
                        vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
                        vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
                        vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);

                        const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
                        const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));

                        combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
                        break;
                    }
                    case ReductionOperation::MEAN_SUM:
                    {
                        const auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<float>(in_info.dimension(axis)), wrapper::traits::vector_128_tag{}));
                        vec_res_value1_f         = wrapper::vmul(wrapper::vcvt<float>(vec_res_value1), vec_width_inv);
                        vec_res_value2_f         = wrapper::vmul(wrapper::vcvt<float>(vec_res_value2), vec_width_inv);
                        vec_res_value3_f         = wrapper::vmul(wrapper::vcvt<float>(vec_res_value3), vec_width_inv);
                        vec_res_value4_f         = wrapper::vmul(wrapper::vcvt<float>(vec_res_value4), vec_width_inv);

                        vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
                        vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
                        vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
                        vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);

                        const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
                        const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
                        auto       res         = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));

                        wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
                        break;
                    }
                    case ReductionOperation::PROD:
                    {
                        const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
                        const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));

                        //re-quantize
                        vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
                        vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
                        vec_res_value3_f = wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
                        vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);

                        vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
                        vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
                        vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
                        vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);

                        const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
                        const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
                        auto       res         = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));

                        wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
                        break;
                    }
                    default:
                        ARM_COMPUTE_ERROR("Not supported");
                }
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                float res_value = 0.f;
                switch(op)
                {
                    case ReductionOperation::ARG_IDX_MAX:
                    case ReductionOperation::ARG_IDX_MIN:
                    case ReductionOperation::MIN:
                    case ReductionOperation::MAX:
                    {
                        res_value = *(input_ptr + x);
                        break;
                    }
                    case ReductionOperation::PROD:
                    {
                        res_value = static_cast<T>(1.0f);
                        break;
                    }
                    default:
                    {
                        res_value = static_cast<T>(0.0f);
                        break;
                    }
                }
                uint32_t res_idx = 0;

                for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
                {
                    const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
                    switch(op)
                    {
                        case ReductionOperation::SUM:
                        case ReductionOperation::MEAN_SUM:
                        {
                            res_value += *in_ptr;
                            break;
                        }
                        case ReductionOperation::SUM_SQUARE:
                        {
                            res_value += *in_ptr * *in_ptr;
                            break;
                        }
                        case ReductionOperation::PROD:
                        {
                            //de-quantize input
                            if(std::is_same<T, uint8_t>::value)
                            {
                                res_value *= dequantize_qasymm8(*in_ptr, iq_info);
                            }
                            else
                            {
                                res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
                            }
                            break;
                        }
                        case ReductionOperation::ARG_IDX_MIN:
                        {
                            if(*in_ptr < res_value)
                            {
                                res_value = *in_ptr;
                                res_idx   = dim;
                            }
                            break;
                        }
                        case ReductionOperation::ARG_IDX_MAX:
                        {
                            if(*in_ptr > res_value)
                            {
                                res_value = *in_ptr;
                                res_idx   = dim;
                            }
                            break;
                        }
                        case ReductionOperation::MIN:
                        {
                            res_value = *in_ptr < res_value ? *in_ptr : res_value;
                            break;
                        }
                        case ReductionOperation::MAX:
                        {
                            res_value = *in_ptr > res_value ? *in_ptr : res_value;
                            break;
                        }
                        default:
                            ARM_COMPUTE_ERROR("Not supported");
                    }
                }

                switch(op)
                {
                    case ReductionOperation::MEAN_SUM:
                    {
                        int32_t res = static_cast<int32_t>(res_value);
                        res /= static_cast<int32_t>(in_info.dimension(axis));
                        *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
                        break;
                    }
                    case ReductionOperation::SUM:
                    {
                        // Subtract accumulated offsets
                        res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
                        *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
                        break;
                    }
                    case ReductionOperation::PROD:
                    {
                        //re-quantize result
                        T res = 0;
                        if(std::is_same<T, uint8_t>::value)
                        {
                            res = quantize_qasymm8(res_value, iq_info);
                        }
                        else
                        {
                            res = quantize_qasymm8_signed(res_value, iq_info);
                        }
                        *(reinterpret_cast<T *>(output.ptr() + x)) = res;
                        break;
                    }
                    case ReductionOperation::ARG_IDX_MIN:
                    case ReductionOperation::ARG_IDX_MAX:
                    {
                        *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
                        break;
                    }
                    default:
                        *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
                }
            }
        },
        input, output);
    }
};

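// reduce_op selects the reducer specialization from the reduction axis and data type;
// complex (two-channel) input is handled separately and only supports SUM over Z.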
void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
{
    const bool is_complex = (input->info()->num_channels() == 2);

    if(is_complex)
    {
        switch(axis)
        {
            case 2:
                switch(input->info()->data_type())
                {
                    case DataType::F32:
                        switch(op)
                        {
                            case ReductionOperation::SUM:
                                return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
                            default:
                                ARM_COMPUTE_ERROR("Not supported");
                        }
                    default:
                        ARM_COMPUTE_ERROR("Not supported");
                }
            default:
                ARM_COMPUTE_ERROR("Not supported");
        }
    }

    switch(axis)
    {
        case 0:
            switch(input->info()->data_type())
            {
                case DataType::QASYMM8:
                    return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
                case DataType::QASYMM8_SIGNED:
                    return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                    return Reducer<RedOpX<float16_t, 8>>::reduceX(window, input, output, RedOpX<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F32:
                    return Reducer<RedOpX<float, 4>>::reduceX(window, input, output, RedOpX<float, 4>(), op);
                case DataType::S32:
                    return Reducer<RedOpX<int32_t, 4>>::reduceX(window, input, output, RedOpX<int32_t, 4>(), op);
                default:
                    ARM_COMPUTE_ERROR("Not supported");
            }
        case 1:
            switch(input->info()->data_type())
            {
                case DataType::QASYMM8:
                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
                case DataType::QASYMM8_SIGNED:
                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                    return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F32:
                    return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
                case DataType::S32:
                    return Reducer<RedOpYZW<int32_t, 4>>::reduceY(window, input, output, RedOpYZW<int32_t, 4>(), op);
                default:
                    ARM_COMPUTE_ERROR("Not supported");
            }
        case 2:
            switch(input->info()->data_type())
            {
                case DataType::QASYMM8:
                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
                case DataType::QASYMM8_SIGNED:
                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                    return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F32:
                    return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
                case DataType::S32:
                    return Reducer<RedOpYZW<int32_t, 4>>::reduceZ(window, input, output, RedOpYZW<int32_t, 4>(), op);
                default:
                    ARM_COMPUTE_ERROR("Not supported");
            }
        case 3:
            switch(input->info()->data_type())
            {
                case DataType::QASYMM8:
                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
                case DataType::QASYMM8_SIGNED:
                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                    return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F32:
                    return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
                case DataType::S32:
                    return Reducer<RedOpYZW<int32_t, 4>>::reduceW(window, input, output, RedOpYZW<int32_t, 4>(), op);
                default:
                    ARM_COMPUTE_ERROR("Not supported");
            }
        default:
            ARM_COMPUTE_ERROR("Unsupported reduction axis");
    }
}

Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
{
    ARM_COMPUTE_UNUSED(op);

    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);

    if(input->num_channels() == 1)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON(op != ReductionOperation::SUM);
        ARM_COMPUTE_RETURN_ERROR_ON(axis != 2);
    }

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");

    if(output->total_size() != 0)
    {
        bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
        if(!is_arg_min_max)
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
            ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::U32, DataType::S32);
        }

        const TensorShape output_shape         = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
        const TensorInfo  tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
    }

    return Status{};
}
} // namespace
1654 
1656  : _input(nullptr), _output(nullptr), _reduction_axis(0), _op(ReductionOperation::SUM_SQUARE)
1657 {
1658 }
1659 
1660 void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
1661 {
1662  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
1663 
1664  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
1665 
1666  _input = input;
1667  _output = output;
1668  _op = op;
1669  _reduction_axis = axis;
1670 
1671  // Configure kernel window
1672  Coordinates coord;
1673  coord.set_num_dimensions(input->info()->num_dimensions());
1674  input->info()->set_valid_region(ValidRegion(coord, input->info()->tensor_shape()));
1675  Window win = calculate_max_window(*input->info(), Steps());
1676  INEKernel::configure(win);
1677 
1678  // Calculate output shape and set if empty
1679  const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
1680  // Output auto initialization if not yet initialized
1681  const bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
1682  DataType output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
1683  auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
1684  output->info()->set_valid_region(ValidRegion(coord, output_shape));
1685 }
1686 
1687 Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
1688 {
1689  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
1690 
1691  return Status{};
1692 }
1693 
1694 void NEReductionOperationKernel::run(const Window &window, const ThreadInfo &info)
1695 {
1696  ARM_COMPUTE_UNUSED(info);
1697  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1698  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1699 
1700  reduce_op(window, _input, _output, _reduction_axis, _op);
1701 }
1702 } // namespace arm_compute
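
Usage sketch (not part of the original file): this kernel is typically driven through the NEReductionOperation runtime function rather than instantiated directly. Below is a minimal sketch of that path; the tensor shapes, axis, reduction operation, and main() wrapper are illustrative assumptions, not taken from this file.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // 2D input of shape (8, 4); reducing along axis 0 (X) with keep_dims
    // collapses X to 1, giving an output of shape (1, 4).
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32));

    // Mirrors the checks in validate_arguments() above, but returns an error
    // Status instead of throwing.
    const Status status = NEReductionOperation::validate(src.info(), dst.info(), 0, ReductionOperation::SUM);
    if(!bool(status))
    {
        return 1;
    }

    NEReductionOperation reduce;
    reduce.configure(&src, &dst, 0 /* axis */, ReductionOperation::SUM);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src, e.g. with execute_window_loop() ...

    reduce.run(); // dispatches to NEReductionOperationKernel via the scheduler
    return 0;
}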