Compute Library 22.05 - impl.cpp
1 /*
2  * Copyright (c) 2019-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
27 
28 namespace arm_compute
29 {
30 namespace cpu
31 {
32 namespace
33 {
34 constexpr auto data_layout = DataLayout::NHWC;
35 const size_t width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
36 const size_t height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
37 const size_t channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
38 
39 constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0);
40 constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1);
41 constexpr size_t vector_size = 8;
42 
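// Pre-computed per-run parameters (window bounds, strides, kernel size, padding and input
// extents) gathered once from the tensor infos so the inner loops below only do pointer
// arithmetic.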
43 struct DepthwiseConvolutionRunInfo
44 {
45  const size_t num_read_elements_per_iteration;
46  const uint32_t x_start;
47  const uint32_t x_end;
48  const uint32_t x_step;
49  const uint32_t x_leftover_start;
50  const size_t input_stride_y;
51  const size_t input_stride_z;
52  const size_t input_max_offset;
53  const size_t weights_width;
54  const size_t weights_height;
55  const size_t weights_stride_y;
56  const size_t weights_stride_z;
57  const size_t conv_stride_x;
58  const size_t conv_stride_y;
59  const size_t conv_pad_left;
60  const size_t conv_pad_top;
61  const size_t input_height;
62  const size_t input_width;
63  const size_t input_depth;
64 
65  DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
66  : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
67  x_start(w.x().start()),
68  x_end(w.x().end()),
69  x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
70  x_leftover_start(std::max(static_cast<int32_t>(w.x().end() + 1) - static_cast<int32_t>(x_step), int32_t(0))),
71  input_stride_y(input.strides_in_bytes().y()),
72  input_stride_z(input.strides_in_bytes().z()),
73  input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
74  weights_width(weights.dimension(width_idx)),
75  weights_height(weights.dimension(height_idx)),
76  weights_stride_y(weights.strides_in_bytes().y()),
77  weights_stride_z(weights.strides_in_bytes().z()),
78  conv_stride_x(conv_info.stride().first),
79  conv_stride_y(conv_info.stride().second),
80  conv_pad_left(conv_info.pad_left()),
81  conv_pad_top(conv_info.pad_top()),
82  input_height(input.dimension(height_idx)),
83  input_width(input.dimension(width_idx)),
84  input_depth(input.dimension(channel_idx))
85  {
86  }
87 };
88 
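// Fixed-point helpers used for requantization. saturating_doubling_high_mul() returns the
// high 32 bits of 2*a*b with saturation and rounding (vqrdmulh), i.e. approximately
// (a * b) / 2^31.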
89 inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b)
90 {
91  return vqrdmulhq_n_s32(a, b);
92 }
93 
94 inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b)
95 {
96  return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
97 }
98 
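// rounding_divide_by_exp2() divides by 2^exponent with rounding to nearest. Together with
// saturating_doubling_high_mul() it applies a fixed-point multiplier, so that
//   requantized ≈ acc * (out_mul / 2^31) / 2^out_shift.
// Illustrative values (not taken from the library): out_mul = 1 << 30 (i.e. 0.5) and
// out_shift = 3 map acc = 1000 to 63, since 1000 * 0.5 / 8 = 62.5 rounds to nearest.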
99 inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent)
100 {
101  const int32x4_t shift = vdupq_n_s32(-exponent);
102  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
103  const int32x4_t fixed = vqaddq_s32(x, fixup);
104  return vrshlq_s32(fixed, shift);
105 }
106 
107 inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent)
108 {
109  const int32x2_t shift = vdup_n_s32(-exponent);
110  const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
111  const int32x2_t fixed = vqadd_s32(x, fixup);
112  return vrshl_s32(fixed, shift);
113 }
114 
115 inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent)
116 {
117  const int32x2_t xs = vdup_n_s32(x);
118  return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
119 }
120 
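// Returns true when the dilated kernel tap (w, h) anchored at (base_w, base_h) lands inside
// the input plane; callers substitute zero (float paths) or the quantization offset
// (quantized paths) for out-of-bounds taps.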
121 inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
122 {
123  const int32_t current_h = base_h + h * dilation.y();
124  const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
125 
126  const int32_t current_w = base_w + w * dilation.x();
127  const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
128 
129  return is_valid_h && is_valid_w;
130 }
131 
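// Floating-point depthwise convolution for depth_multiplier == 1: the vectorised main loop
// processes vector_size / sizeof(T) channels per iteration along the NHWC channel axis,
// followed by a scalar leftover loop for the remaining channels.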
132 template <typename T>
133 void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
134  const Size2D &dilation, const Window &window, bool has_biases)
135 {
136  constexpr auto element_per_vector = vector_size / sizeof(T);
137  using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
138  using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
139 
140  const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
141 
142  const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
143 
144  Window execution_window = window;
145  execution_window.set(Window::DimX, dim_single_unit_step);
146 
147  Window win_input = window;
148  win_input.set(Window::DimX, dim_manual_loop);
149  win_input.set(Window::DimY, dim_manual_loop);
150  win_input.set(Window::DimZ, dim_manual_loop);
151 
152  Window win_weights = win_input;
153  win_weights.set(Window::DimW, dim_manual_loop);
154 
155  Window win_output = window;
156  win_output.set(Window::DimX, dim_manual_loop);
157 
158  Iterator input_it(src, win_input);
159  Iterator weights_it(weights, win_weights);
160  Iterator output_it(dst, win_output);
161  Iterator biases_it{};
162 
163  if(has_biases)
164  {
165  biases_it = Iterator(biases, win_weights);
166  }
167 
168  execute_window_loop(execution_window, [&](const Coordinates & id)
169  {
170  const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
171  const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
172  const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
173 
174  auto const base_weights_ptr = weights_it.ptr();
175  uint32_t x = run_info.x_start;
176 
177  for(; x < run_info.x_leftover_start; x += run_info.x_step)
178  {
179  VectorType acc = zero_vector;
180  auto weights_ptr = base_weights_ptr;
181  int64_t input_offset = base_input_offset;
182 
183  for(uint32_t h = 0; h < run_info.weights_height; ++h)
184  {
185  int64_t offs = input_offset + x * sizeof(T);
186  for(uint32_t w = 0; w < run_info.weights_width; ++w)
187  {
188  const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
189  const auto input_vals = is_valid_region ?
190  wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
191  zero_vector;
192  const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
193  acc = wrapper::vmla(acc, weights_vals, input_vals);
194 
195  offs += dilation.x() * run_info.input_stride_y;
196  }
197 
198  weights_ptr += run_info.weights_stride_z;
199  input_offset += dilation.y() * run_info.input_stride_z;
200  }
201 
202  if(has_biases)
203  {
204  const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
205  acc = wrapper::vadd(acc, biases_vals);
206  }
207 
208  wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
209  }
210 
211  for(; x < run_info.x_end; ++x)
212  {
213  auto acc_scalar = T{ 0 };
214  auto weights_ptr = base_weights_ptr;
215  int64_t input_offset = base_input_offset;
216 
217  for(size_t h = 0; h < run_info.weights_height; ++h)
218  {
219  int64_t offs = input_offset + x * sizeof(T);
220  for(size_t w = 0; w < run_info.weights_width; ++w)
221  {
222  const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
223  const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
224  const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
225 
226  acc_scalar += (input_vals * weights_vals);
227 
228  offs += dilation.x() * run_info.input_stride_y;
229  }
230 
231  weights_ptr += run_info.weights_stride_z;
232  input_offset += dilation.y() * run_info.input_stride_z;
233  }
234 
235  if(has_biases)
236  {
237  const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
238  acc_scalar += biases_vals;
239  }
240  *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
241  }
242  },
243  input_it, weights_it, biases_it, output_it);
244 }
245 
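// Floating-point depthwise convolution for a generic depth_multiplier: each input channel
// produces depth_multiplier consecutive output channels, accumulated in a small per-pixel
// scalar accumulator array.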
246 template <typename T>
247 void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
248  const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
249 {
250  const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
251 
252  Window execution_window = window;
253  execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
254 
255  Window win_input = execution_window;
256  win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
257  win_input.set(Window::DimY, dim_manual_loop);
258  win_input.set(Window::DimZ, dim_manual_loop);
259 
260  Window win_weights = window;
261  win_weights.set_dimension_step(Window::DimX, run_info.x_step);
262  win_weights.set(Window::DimY, dim_manual_loop);
263  win_weights.set(Window::DimZ, dim_manual_loop);
264  win_weights.set(Window::DimW, dim_manual_loop);
265 
266  Window win_output = window;
267  win_output.set_dimension_step(Window::DimX, run_info.x_step);
268 
269  Iterator input_it(src, win_input);
270  Iterator weights_it(weights, win_weights);
271  Iterator output_it(dst, win_output);
272  Iterator biases_it{};
273 
274  if(has_biases)
275  {
276  biases_it = Iterator(biases, win_weights);
277  }
278 
279  execute_window_loop(execution_window, [&](const Coordinates & id)
280  {
281  std::vector<T> acc(depth_multiplier, static_cast<T>(0));
282 
283  const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
284  const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
285  int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
286 
287  auto weights_ptr = weights_it.ptr();
288  for(size_t h = 0; h < run_info.weights_height; ++h)
289  {
290  int offs = input_offset;
291  for(size_t w = 0; w < run_info.weights_width; ++w)
292  {
293  const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
294  const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
295 
296  for(size_t m = 0; m < depth_multiplier; ++m)
297  {
298  const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
299  acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
300  }
301 
302  offs += dilation.x() * run_info.input_stride_y;
303  }
304 
305  weights_ptr += run_info.weights_stride_z;
306  input_offset += dilation.y() * run_info.input_stride_z;
307  }
308 
309  if(has_biases)
310  {
311  for(size_t m = 0; m < depth_multiplier; ++m)
312  {
313  const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
314  *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
315  }
316  }
317  else
318  {
319  for(size_t m = 0; m < depth_multiplier; ++m)
320  {
321  *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
322  }
323  }
324  },
325  input_it, weights_it, biases_it, output_it);
326 }
327 
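// Quantized (QASYMM8/QASYMM8_SIGNED) depthwise convolution for depth_multiplier == 1.
// Accumulation is done in int32; the input and weights zero points are removed via the
// running in_sum / we_sum terms plus k_offset, then each channel is requantized with its
// output_multiplier / output_shift pair.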
328 template <typename T, typename TW>
329 void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
330  const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
331 {
332  ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
333  constexpr auto element_per_vector = vector_size / sizeof(T);
334  using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
335  using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
336  using AccType = int32_t;
337  using AccArrayType = std::array<AccType, element_per_vector>;
338 
339  const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
340  const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
341 
342  const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
343 
344  const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
345  const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
346  const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
347  const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
348 
349  Window execution_window = window;
350  execution_window.set(Window::DimX, dim_single_unit_step);
351 
352  Window win_input = window;
353  win_input.set(Window::DimX, dim_manual_loop);
354  win_input.set(Window::DimY, dim_manual_loop);
355  win_input.set(Window::DimZ, dim_manual_loop);
356 
357  Window win_weights = win_input;
358  win_weights.set(Window::DimW, dim_manual_loop);
359 
360  Window win_output = window;
361  win_output.set(Window::DimX, dim_manual_loop);
362 
363  Iterator input_it(src, win_input);
364  Iterator weights_it(weights, win_weights);
365  Iterator output_it(dst, win_output);
366  Iterator biases_it{};
367 
368  if(has_biases)
369  {
370  biases_it = Iterator(biases, win_weights);
371  }
372 
373  execute_window_loop(execution_window, [&](const Coordinates & id)
374  {
375  const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
376  const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
377  const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
378  auto const base_weights_ptr = weights_it.ptr();
379  size_t x = run_info.x_start;
380 
381  for(; x < run_info.x_leftover_start; x += run_info.x_step)
382  {
383  AccArrayType acc{};
384  AccArrayType in_sum{};
385  AccArrayType we_sum{};
386 
387  auto weights_ptr = base_weights_ptr;
388  auto input_offset = base_input_offset;
389 
390  for(size_t h = 0; h < run_info.weights_height; ++h)
391  {
392  int64_t offs = input_offset + x * sizeof(T);
393  for(size_t w = 0; w < run_info.weights_width; ++w)
394  {
395  const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
396  const auto input_vals = is_valid_region ?
397  wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
398  out_of_bound_vector;
399  const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
400 
401  for(size_t i = 0; i < element_per_vector; ++i)
402  {
403  acc.at(i) += input_vals[i] * weights_vals[i];
404  in_sum.at(i) += input_vals[i];
405  we_sum.at(i) += weights_vals[i];
406  }
407 
408  offs += dilation.x() * run_info.input_stride_y;
409  }
410 
411  weights_ptr += run_info.weights_stride_z;
412  input_offset += dilation.y() * run_info.input_stride_z;
413  }
414 
415  VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
416  for(size_t i = 0; i < element_per_vector; ++i)
417  {
418  acc.at(i) -= in_sum.at(i) * weights_qoffset;
419  acc.at(i) -= we_sum.at(i) * input_qoffset;
420  acc.at(i) += k_offset;
421 
422  if(has_biases)
423  {
424  acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
425  }
426 
427  const int32_t out_mul = output_multiplier.at(x + i);
428  const int32_t out_shift = output_shift.at(x + i);
429  if(out_shift < 0)
430  {
431  acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
432  }
433  else
434  {
435  acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
436  }
437  out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
438  }
439 
440  wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
441  }
442 
443  // left-over
444  for(; x < run_info.x_end; ++x)
445  {
446  AccType acc = 0;
447  AccType in_sum = 0;
448  AccType we_sum = 0;
449 
450  auto weights_ptr = base_weights_ptr;
451  auto input_offset = base_input_offset;
452 
453  for(size_t h = 0; h < run_info.weights_height; ++h)
454  {
455  int64_t offs = input_offset + x * sizeof(T);
456  for(size_t w = 0; w < run_info.weights_width; ++w)
457  {
458  const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
459  const auto input_val = is_valid_region ?
460  *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
461  out_of_bound_value;
462  const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
463 
464  acc += input_val * weights_val;
465  in_sum += input_val;
466  we_sum += weights_val;
467 
468  offs += dilation.x() * run_info.input_stride_y;
469  }
470 
471  weights_ptr += run_info.weights_stride_z;
472  input_offset += dilation.y() * run_info.input_stride_z;
473  }
474 
475  T out_vals{ 0 };
476 
477  acc -= in_sum * weights_qoffset;
478  acc -= we_sum * input_qoffset;
479  acc += k_offset;
480 
481  if(has_biases)
482  {
483  acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
484  }
485 
486  const int32_t out_mul = output_multiplier.at(x);
487  const int32_t out_shift = output_shift.at(x);
488 
489  if(out_shift < 0)
490  {
491  acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
492  }
493  else
494  {
495  acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
496  }
497 
498  out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
499  *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
500  }
501  },
502  input_it, weights_it, biases_it, output_it);
503 }
504 
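// Quantized depthwise convolution for a generic depth_multiplier: scalar accumulation per
// output channel with the same offset-correction and requantization scheme as the
// multiplier-1 path.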
505 template <typename T, typename TW>
506 void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
507  const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
508 {
509  using AccType = int32_t;
510 
511  const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
512 
513  const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
514 
515  const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
516  const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
517  const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
518  const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
519 
520  Window execution_window = window;
521  execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
522 
523  Window win_input = execution_window;
524  win_input.set(Window::DimY, dim_manual_loop);
525  win_input.set(Window::DimZ, dim_manual_loop);
526 
527  Window win_weights = window;
528  win_weights.set_dimension_step(Window::DimX, run_info.x_step);
529  win_weights.set(Window::DimY, dim_manual_loop);
530  win_weights.set(Window::DimZ, dim_manual_loop);
531  win_weights.set(Window::DimW, dim_manual_loop);
532 
533  Window win_output = window;
534  win_output.set_dimension_step(Window::DimX, run_info.x_step);
535 
536  Iterator input_it(src, win_input);
537  Iterator weights_it(weights, win_weights);
538  Iterator output_it(dst, win_output);
539  Iterator biases_it{};
540 
541  if(has_biases)
542  {
543  biases_it = Iterator(biases, win_weights);
544  }
545 
546  execute_window_loop(execution_window, [&](const Coordinates & id)
547  {
548  std::vector<AccType> acc(depth_multiplier, 0);
549  std::vector<AccType> we_sum(depth_multiplier, 0);
550  AccType in_sum = 0;
551 
552  const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
553  const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
554  int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
555 
556  auto weights_ptr = weights_it.ptr();
557  for(size_t h = 0; h < run_info.weights_height; ++h)
558  {
559  int offs = input_offset;
560  for(size_t w = 0; w < run_info.weights_width; ++w)
561  {
562  const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
563  const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
564 
565  for(size_t m = 0; m < depth_multiplier; ++m)
566  {
567  const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
568  acc.at(m) += input_val * weights_val;
569 
570  we_sum.at(m) += weights_val;
571  }
572 
573  offs += dilation.x() * run_info.input_stride_y;
574  in_sum += input_val;
575  }
576 
577  weights_ptr += run_info.weights_stride_z;
578  input_offset += dilation.y() * run_info.input_stride_z;
579  }
580 
581  for(size_t m = 0; m < depth_multiplier; ++m)
582  {
583  acc.at(m) -= in_sum * weights_qoffset;
584  acc.at(m) -= we_sum.at(m) * input_qoffset;
585  acc.at(m) += k_offset;
586 
587  if(has_biases)
588  {
589  acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
590  }
591 
592  const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
593  const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
594  if(out_shift < 0)
595  {
596  acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
597  }
598  else
599  {
600  acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
601  }
602  *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
603  }
604  },
605  input_it, weights_it, biases_it, output_it);
606 }
607 
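// Specialised quantized path taken when depth_multiplier is a power of two (>= 8) and the
// weights are quantized per-tensor: offsets are subtracted up front on widened int16
// vectors and the products are accumulated with vmlal into two int32x4 accumulators per
// group of vector_size output channels.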
608 template <typename T, typename TW>
609 void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
610  const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
611 {
612  constexpr int half_vec = vector_size / 2;
613 
614  using AccType = int32_t;
615  using AccVectorType = typename wrapper::traits::neon_vector<AccType, half_vec>::type;
616  using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
617  using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
618 
619  const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
620 
621  const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
622  const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
623  const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
624 
625  const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
626  const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
627  const auto zero = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});
628 
629  const auto out_mul = output_multiplier.at(0);
630  const auto out_shift = output_shift.at(0);
631 
632  Window execution_window = window;
633  execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
634 
635  Window win_input = execution_window;
636  win_input.set(Window::DimY, dim_manual_loop);
637  win_input.set(Window::DimZ, dim_manual_loop);
638 
639  Window win_weights = window;
640  win_weights.set_dimension_step(Window::DimX, run_info.x_step);
641  win_weights.set(Window::DimY, dim_manual_loop);
642  win_weights.set(Window::DimZ, dim_manual_loop);
643  win_weights.set(Window::DimW, dim_manual_loop);
644 
645  Window win_output = window;
646  win_output.set_dimension_step(Window::DimX, run_info.x_step);
647 
648  Iterator input_it(src, win_input);
649  Iterator weights_it(weights, win_weights);
650  Iterator output_it(dst, win_output);
651  Iterator biases_it{};
652 
653  if(has_biases)
654  {
655  biases_it = Iterator(biases, win_weights);
656  }
657 
658  std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
659  std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
660 
661  execute_window_loop(execution_window, [&](const Coordinates & id)
662  {
663  std::fill(begin(acc0), end(acc0), zero);
664  std::fill(begin(acc1), end(acc1), zero);
665 
666  const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
667  const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
668  int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
669 
670  auto weights_ptr = weights_it.ptr();
671  for(size_t h = 0; h < run_info.weights_height; ++h)
672  {
673  const int32_t current_h = input_z + h * dilation.y();
674  if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
675  {
676  int offs = input_offset;
677  for(size_t w = 0; w < run_info.weights_width; ++w)
678  {
679  const int32_t current_w = input_y + w * dilation.x();
680  if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
681  {
682  const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
683  const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
684  const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
685 
686  for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
687  {
688  const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
689  const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
690  const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
691 
692  acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
693  acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
694  }
695  }
696 
697  offs += dilation.x() * run_info.input_stride_y;
698  }
699  }
700 
701  weights_ptr += run_info.weights_stride_z;
702  input_offset += dilation.y() * run_info.input_stride_z;
703  }
704 
705  for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
706  {
707  if(has_biases)
708  {
709  const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
710  const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
711 
712  acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
713  acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
714  }
715 
716  if(out_shift < 0)
717  {
718  acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
719  acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
720  }
721  else
722  {
723  acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
724  acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
725  }
726 
727  acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
728  acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
729 
730  const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
731  wrapper::vmovn(acc1.at(i)));
732 
733  if(std::is_same<T, uint8_t>::value)
734  {
735  wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
736  }
737  else
738  {
739  wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
740  }
741  }
742  },
743  input_it, weights_it, biases_it, output_it);
744 }
745 } // namespace
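// Exported entry points: unpack ConvolutionInfo and dispatch to the multiplier-1 or generic
// loop above based on depth_multiplier.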
746 template <typename T, typename TW>
747 void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases,
748  ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
749 {
750  PadStrideInfo conv_info = info.pad_stride_info;
751  unsigned int depth_multiplier = info.depth_multiplier;
752  Size2D dilation = info.dilation;
753 
754  if(depth_multiplier == 1)
755  {
756  depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, conv_info, dilation, window, has_biases);
757  }
758  else
759  {
760  depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, has_biases);
761  }
762 }
763 template void run_depthwise_float<float, float>(const ITensor *src, const ITensor *weights, const ITensor *biases,
764  ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
765 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
766 template void run_depthwise_float<float16_t, float16_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
767  ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
768 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
769 
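// Quantized 8-bit entry point: derives a per-output-channel fixed-point multiplier and shift
// from input_scale * weights_scale / output_scale (broadcasting a per-tensor weights scale
// across channels), then dispatches to the multiplier-1, generic, or power-of-two-multiplier
// specialised loop.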
770 template <typename T, typename TW>
771 void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases,
772  ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
773 {
774  PadStrideInfo conv_info = info.pad_stride_info;
775  unsigned int depth_multiplier = info.depth_multiplier;
776  Size2D dilation = info.dilation;
777  std::vector<int> output_multiplier;
778  std::vector<int> output_shift;
779 
780  const auto input_scale = src->info()->quantization_info().uniform().scale;
781  const auto output_scale = dst->info()->quantization_info().uniform().scale;
782  auto weights_scale = weights->info()->quantization_info().scale();
783 
784  if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
785  {
786  for(size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
787  {
788  weights_scale.push_back(weights_scale.front());
789  }
790  }
791 
792  for(const auto &s : weights_scale)
793  {
794  int32_t out_mult = 0;
795  int32_t out_shift = 0;
796  const float multiplier = input_scale * s / output_scale;
797  arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
798 
799  output_multiplier.push_back(out_mult);
800  output_shift.push_back(out_shift);
801  }
802 
803  if(depth_multiplier == 1)
804  {
805  depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier, output_shift, window, has_biases);
806  }
807  else
808  {
809  const bool is_pow2 = ((depth_multiplier & (depth_multiplier - 1)) == 0);
810  const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
811 
812  if(is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
813  {
814  depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
815  }
816  else
817  {
818  depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
819  }
820  }
821 }
822 template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
823  ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
824 template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
825  ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
826 template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
827  ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
828 } // namespace cpu
829 } // namespace arm_compute
Definition: impl.cpp:51