Compute Library
 22.11
quantized.h
/*
 * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H
#define SRC_CORE_NEON_KERNELS_QUANTIZED_H

#include "arm_compute/core/Types.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h" // wrapper:: intrinsics used throughout this file

#include <arm_neon.h>

namespace arm_compute
{
namespace cpu
{
template <typename T>
void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);

    const int window_start_x = window.x().start();
    const int window_end_x = window.x().end();
    const int window_step_x = 16;
    const int window_half_step_x = window_step_x / 2;
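    // The vector loops below process 16 output channels (the X dimension of the NHWC window) per
    // iteration; the 8-wide half step is only used as a tail step for MAX pooling.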

    Window window_out = window;
    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator in(src, window_src);
    Iterator out(dst0, window_out);

    using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
    using q16_t = typename wrapper::traits::promote_t<T>;
    using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q32_t = typename wrapper::traits::promote_t<q16_t>;
    using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;

    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
    const int pool_pad_right = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();

    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);

    const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
    const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();

    const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
    // "new_offset" doesn't have to account for "half_scale_v" in its computation:
    // since the requantization is performed in a single step, no rounding uncertainty is introduced.
    const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);

    const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
    const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
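    // These follow from src_scale * (q_src - src_offset) = dst_scale * (q_dst - dst_offset), i.e.
    // q_dst = q_src / requant_scale + (dst_offset - src_offset / requant_scale) with requant_scale = dst_scale / src_scale;
    // they are numerically the same values as quant_rescale / new_offset above.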

    execute_window_loop(window_out, [&](const Coordinates & id)
    {
        const int idx_width = id.y() * pool_stride_x;
        const int idx_height = id.z() * pool_stride_y;
        const int pool_limit_y = pool_pad_top - idx_height;
        const int pool_limit_x = pool_pad_left - idx_width;

        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
        const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
        const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
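        // pool_start_* / pool_end_* clamp the pooling kernel to the part of it that overlaps the
        // source window for this output position, so out-of-range elements are never read.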

        int x_off = window_start_x;
        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
        {
            if(pool_info.pool_type != PoolingType::MAX)
            {
                q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
                q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
                q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
                q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});

                // Calculate scale
                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

                // Perform pooling
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);

                        const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
                        const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
                        vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
                        vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
                        vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
                        vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
                    }
                }

                if(src_qinfo != dst_qinfo)
                {
                    const float32x4x4_t vres =
                    {
                        {
                            vcvtq_f32_q32(vres1),
                            vcvtq_f32_q32(vres2),
                            vcvtq_f32_q32(vres3),
                            vcvtq_f32_q32(vres4),
                        }
                    };
                    const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
                    // Store result
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
                }
                else
                {
                    const float32x4_t scale_v = vdupq_n_f32(scale);
                    // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
                    vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
                    vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
                    vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
                    vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));

                    const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
                    const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
                    // Store result
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
                }
            }
            else
            {
                q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});

                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        vres = wrapper::vmax(vres, data);
                    }
                }

                // Store result
                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
                                (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), requant_qinfo) : vres);
            }
        }

        if(pool_info.pool_type == PoolingType::MAX)
        {
            for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
            {
                q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        vres = wrapper::vmax(vres, data);
                    }
                }

                // Store result
                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
                                (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
            }
        }

        // Left-overs loop
        for(; x_off < window_end_x; ++x_off)
        {
            if(pool_info.pool_type != PoolingType::MAX)
            {
                q32_t res = static_cast<q32_t>(0.f);

                // Calculate scale
                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

                // Perform pooling
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        res += data;
                    }
                }

                if(src_qinfo != dst_qinfo)
                {
                    const float res_f = static_cast<float>(res);
                    const float new_scale = quant_rescale / scale;
                    const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));

                    // Store result
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
                }
                else
                {
                    // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
                    res = static_cast<T>(0.5f + static_cast<float>(res) * scale);

                    // Store result
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
                }
            }
            else
            {
                T res = std::numeric_limits<T>::min();

                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        res = std::max(res, data);
                    }
                }

                // Store result
                if(src_qinfo != dst_qinfo)
                {
                    const float res_f = static_cast<float>(res);
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
                }
                else
                {
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
                }
            }
        }

    },
    in, out);
}

#if defined(ENABLE_NCHW_KERNELS)
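// Scales each of the 8 lanes of v by 1 / (pooling region size) for average pooling. The region is
// clamped against upper_bound_w / upper_bound_h and, when exclude_padding is set, against the start
// of the tensor, so the divisor only counts elements that actually contribute. id_offset and step
// describe which output column each lane corresponds to.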
template <typename T, typename TVec>
inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
                               const int pool_size, const int upper_bound_w, const int upper_bound_h,
                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    int start_x = (id.x() + id_offset) * stride_x - pad_x;
    int start_y = id.y() * stride_y - pad_y;
    const int end_y = std::min(start_y + pool_size, upper_bound_h);
    if(exclude_padding)
    {
        start_y = std::max(0, start_y);
    }

    std::array<T, 8> elems =
    {
        {
            wrapper::vgetlane(v, 0),
            wrapper::vgetlane(v, 1),
            wrapper::vgetlane(v, 2),
            wrapper::vgetlane(v, 3),
            wrapper::vgetlane(v, 4),
            wrapper::vgetlane(v, 5),
            wrapper::vgetlane(v, 6),
            wrapper::vgetlane(v, 7),
        }
    };

    for(auto &el : elems)
    {
        int c_start_x = start_x;
        const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
        if(exclude_padding)
        {
            c_start_x = std::max(0, c_start_x);
        }
        float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
        el *= scale;
        start_x += step * stride_x;
    }

    v = wrapper::vsetlane(elems[0], v, 0);
    v = wrapper::vsetlane(elems[1], v, 1);
    v = wrapper::vsetlane(elems[2], v, 2);
    v = wrapper::vsetlane(elems[3], v, 3);
    v = wrapper::vsetlane(elems[4], v, 4);
    v = wrapper::vsetlane(elems[5], v, 5);
    v = wrapper::vsetlane(elems[6], v, 6);
    v = wrapper::vsetlane(elems[7], v, 7);
}

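// Loads 16 contiguous elements from ptr, substituting fval for any element that falls outside the
// srcw x srch source region; x and y are expressed in padded coordinates, hence the pad_l / pad_t offsets.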
template <typename T>
auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval)
{
    ARM_COMPUTE_UNUSED(pad_b, pad_r);
    T vec[16];
    // Handle reading a row out of the tensor
    const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
    for(int i = 0; i < 16; i++)
    {
        if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
        {
            vec[i] = *(ptr + i);
        }
        else
        {
            vec[i] = fval;
        }
    }
    return wrapper::vloadq(vec);
}

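// Stores up to 16 results without writing past dst_w. With deinterleave == true the two 8-element
// vectors hold the even (lower) and odd (upper) output columns and are interleaved on store;
// otherwise they are written back to back.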
template <typename T, typename V, bool deinterleave>
inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr)
{
    if(deinterleave)
    {
        for(int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i)
        {
            *(ptr + i * 2) = lower[i];
        }
        for(int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i)
        {
            *(ptr + 1 + i * 2) = upper[i];
        }
    }
    else
    {
        for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
        {
            *(ptr + i) = lower[i];
        }
        for(int i = 0; i < 8 && (i + x + 8) < dst_w; ++i)
        {
            *(ptr + i + 8) = upper[i];
        }
    }
}

template <typename T, typename V>
inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr)
{
    for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
    {
        *(ptr + i) = v[i];
    }
}

template <typename T>
void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    /** SIMD vector types */
    using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
    using q16_t = typename wrapper::traits::promote_t<T>;
    using q16x4_t = typename wrapper::traits::neon_vector<q16_t, 4>::type;
    using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;

    constexpr int pool_size = 2;
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    const int pool_pad_right = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
    const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
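    // With stride 1 the pair-wise sums below produce the even output columns in lower_res and the odd
    // ones in upper_res, so consecutive lanes passed to scale_vector_q16x8 are two output columns apart.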
    const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
    const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
    const bool have_different_qinfo = src_qinfo != dst_qinfo;

    const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
    const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
    const int src_w = src->info()->dimension(0);
    const int src_h = src->info()->dimension(1);
    const int dst_w = dst0->info()->dimension(0);

    const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? std::numeric_limits<T>::min() : T(0);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto x_val = id.x() * pool_stride_x;
        const auto y_val_0 = id.y() * pool_stride_y;
        const auto y_val_1 = (id.y() * pool_stride_y) + 1;

        auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                              x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
        auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);

        q8x8_t lower_res = {};
        q8x8_t upper_res = {};

        if(pool_info.pool_type != PoolingType::MAX)
        {
            const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
            const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };

            // Add rows
            const q16x8x2_t vrsum =
            {
                {
                    wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
                    wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
                }
            };

            // Pair-wise add row data
            const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
            const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));

            q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);

            // Scale lower result
            scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x,
                                               pool_size, upper_bound_w, upper_bound_h,
                                               pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
            lower_res = wrapper::vmovn(res_lower);

            // Compute upper result for stride_x == 1
            if(pool_stride_x == 1)
            {
                // Shifted row sum
                const q16x8x2_t vrsum_shifted =
                {
                    {
                        wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
                        wrapper::vext_1(vrsum.val[1], vrsum.val[1])
                    }
                };

                // Pair-wise add shifted row
                q16x8_t res_upper = wrapper::vcombine(
                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));

                // Scale upper result
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                upper_res = wrapper::vmovn(res_upper);
            }
        }
        else
        {
            const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
            lower_res = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
            if(pool_stride_x == 1)
            {
                const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
                upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
            }
        }

        if(have_different_qinfo)
        {
            const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
            lower_res = wrapper::vgetlow(requantized_dst);
            upper_res = wrapper::vgethigh(requantized_dst);
        }
        auto out_ptr = reinterpret_cast<T *>(out.ptr());
        // Store result
        if(pool_stride_x == 1)
        {
            write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr);
        }
        else
        {
            write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr);
        }
    },
    in, out);
}

template <typename T>
void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    /** SIMD vector types */
    using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
    using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
    using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
    using q16_t = typename wrapper::traits::promote_t<T>;
    using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
    using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;

    constexpr int pool_size = 3;
    const int pool_pad_right = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);

    const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();

    const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
    const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);

    const T *const src_top_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
    const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
    const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));

    const int src_w = src->info()->dimension(0);
    const int src_h = src->info()->dimension(1);
    const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
    const int dst_w = dst0->info()->dimension(0);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto x_val = id.x() * pool_stride_x;
        const auto y_val_0 = id.y() * pool_stride_y;
        const auto y_val_1 = (id.y() * pool_stride_y) + 1;
        const auto y_val_2 = (id.y() * pool_stride_y) + 2;

        auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                              x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
        auto middle_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value);
        auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);

        q8x8_t fres = {};
        q8x16_t fqres = {};

        if(pool_info.pool_type == PoolingType::AVG)
        {
            // Convert data to u16
            const q16x8x2_t top_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
            const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
            const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };

            // Calculate row sums
            const q16x8x2_t vrsum =
            {
                {
                    wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
                    wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
                }
            };
            const q16x8x2_t vrsum_shifted_1 =
            {
                {
                    wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
                    wrapper::vext_1(vrsum.val[1], vrsum.val[1])
                }
            };
            const q16x8x2_t vrsum_shifted_2 =
            {
                {
                    wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
                    wrapper::vext_2(vrsum.val[1], vrsum.val[1])
                }
            };
            // Calculate final sum
            q16x8x2_t final_sum =
            {
                {
                    wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
                    wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
                }
            };
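            // Lane i of final_sum now holds vrsum[i] + vrsum[i + 1] + vrsum[i + 2], i.e. the complete
            // 3-wide horizontal sum for the output at column i.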
            if(pool_stride_x == 2)
            {
                q16x8_t res =
                {
                    wrapper::vgetlane(final_sum.val[0], 0),
                    wrapper::vgetlane(final_sum.val[0], 2),
                    wrapper::vgetlane(final_sum.val[0], 4),
                    wrapper::vgetlane(final_sum.val[0], 6),
                    wrapper::vgetlane(final_sum.val[1], 0),
                    wrapper::vgetlane(final_sum.val[1], 2),
                    wrapper::vgetlane(final_sum.val[1], 4),
                    wrapper::vgetlane(final_sum.val[1], 6),
                };

                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                fres = wrapper::vmovn(res);
            }
            else
            {
                // Scale lower result
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                // Scale upper result
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
            }
        }
        else
        {
            const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
            const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
            const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
            const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);

            if(pool_stride_x == 2)
            {
                const q8x8x2_t table = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
                static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
                fres = wrapper::vtbl(table, lookup_val);
            }
            else
            {
                fqres = final_max;
            }
        }

        // Store result
        if(pool_stride_x == 1)
        {
            if(src_qinfo != dst_qinfo)
            {
                fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
            }
            write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr()));
        }
        else
        {
            if(src_qinfo != dst_qinfo)
            {
                fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
            }
            write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr()));
        }
    },
    in, out);
}

template <typename T>
void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    /** SIMD vector types */
    using q16_t = typename wrapper::traits::promote_t<T>;
    using q32_t = typename wrapper::traits::promote_t<q16_t>;

    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
    const int pool_pad_right = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);

    const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
    const int src_w = src->info()->dimension(0);
    const int src_h = src->info()->dimension(1);
    const T fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
    const int stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x());
    const int stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y());

    execute_window_loop(window, [&](const Coordinates & id)
    {
        T res = std::numeric_limits<T>::min();

        if(pool_info.pool_type != PoolingType::MAX)
        {
            q32_t sres = 0;

            // Calculate scale
            const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

            // Perform pooling
            for(int y = 0; y < pool_size_y; ++y)
            {
                for(int x = 0; x < pool_size_x; ++x)
                {
                    const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);

                    const int idx = x + id.x() * pool_stride_x - pool_pad_left;
                    const int idy = y + id.y() * pool_stride_y - pool_pad_top;
                    const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
                    sres += data;
                }
            }
            // Divide by scale
            res = static_cast<T>(support::cpp11::round(sres * scale));
        }
        else
        {
            for(int y = 0; y < pool_size_y; ++y)
            {
                for(int x = 0; x < pool_size_x; ++x)
                {
                    const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);

                    const int idx = x + id.x() * pool_stride_x - pool_pad_left;
                    const int idy = y + id.y() * pool_stride_y - pool_pad_top;
                    const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
                    res = std::max(res, data);
                }
            }
        }
        // Store result
        res = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res;
        *(reinterpret_cast<T *>(out.ptr())) = res;
    },
    in, out);
}
#endif /* defined(ENABLE_NCHW_KERNELS) */
} // namespace cpu
} // namespace arm_compute

#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H