Arm Compute Library 21.05 — source listing of CpuDepthwiseConvolutionNativeKernel.cpp
(Doxygen navigation: "Go to the documentation of this file.")
1 /*
2  * Copyright (c) 2019-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
30 #include "src/core/CPP/Validate.h"
31 #include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
37 
38 namespace arm_compute
39 {
40 namespace cpu
41 {
42 namespace kernels
43 {
44 namespace
45 {
constexpr auto data_layout = DataLayout::NHWC; // this kernel only supports NHWC tensors

// Zero-extent dimension: the iterator does not advance this axis automatically;
// the loop body walks it by hand via raw byte offsets.
constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0);
// One iteration of unit step along the axis.
constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1);
// Width in bytes of the NEON vectors used below (64-bit D registers):
// element counts are derived as vector_size / sizeof(element).
constexpr size_t vector_size = 8;
54 
// Immutable per-run parameters, computed once from the tensor infos, the
// convolution descriptor and the execution window so the hot loops only do
// plain integer arithmetic (no ITensorInfo queries per output element).
struct DepthwiseConvolutionRunInfo
{
 // NOTE(review): the declaration of num_read_elements_per_iteration appears to
 // have been dropped by the listing extraction; the constructor below
 // initializes it first, so it must be declared before x_start in the real file.
 const uint32_t x_start;          // first X (channel) index of the window
 const uint32_t x_end;            // one-past-last X (channel) index
 const uint32_t x_step;           // channels consumed per vectorized iteration
 const uint32_t x_leftover_start; // first x where a full vector step would overrun; scalar tail starts here
 const size_t input_stride_y;     // byte stride between input columns (W axis in NHWC)
 const size_t input_stride_z;     // byte stride between input rows (H axis in NHWC)
 const size_t input_max_offset;   // clamp value keeping guarded/speculative loads inside the input buffer
 const size_t weights_width;
 const size_t weights_height;
 const size_t weights_stride_y;
 const size_t weights_stride_z;
 const size_t conv_stride_x;
 const size_t conv_stride_y;
 const size_t conv_pad_left;
 const size_t conv_pad_top;
 const size_t input_height;
 const size_t input_width;
 const size_t input_depth;

 DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1)
 // Without a depth multiplier a full vector of channels is read per step;
 // with one, a single input channel is read and fanned out to m outputs.
 : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
 x_start(w.x().start()),
 x_end(w.x().end()),
 x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
 // Last x from which a whole vector still fits; max() guards tiny windows.
 x_leftover_start(std::max(static_cast<int32_t>(w.x().end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))),
 input_stride_y(input.strides_in_bytes().y()),
 input_stride_z(input.strides_in_bytes().z()),
 // Highest byte offset that may safely be dereferenced when clamping loads.
 input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
 weights_width(weights.dimension(width_idx)),
 weights_height(weights.dimension(height_idx)),
 weights_stride_y(weights.strides_in_bytes().y()),
 weights_stride_z(weights.strides_in_bytes().z()),
 conv_stride_x(conv_info.stride().first),
 conv_stride_y(conv_info.stride().second),
 conv_pad_left(conv_info.pad_left()),
 conv_pad_top(conv_info.pad_top()),
 input_height(input.dimension(height_idx)),
 input_width(input.dimension(width_idx)),
 input_depth(input.dimension(channel_idx))
 {
 }
};
100 
101 inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
102 {
103  const int32_t current_h = base_h + h * dilation.y();
104  const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
105 
106  const int32_t current_w = base_w + w * dilation.x();
107  const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
108 
109  return is_valid_h && is_valid_w;
110 }
111 
// Depthwise convolution, float path, depth_multiplier == 1 (NHWC layout).
// The window's X (channel) axis is driven manually: a vectorized main loop
// consumes x_step channels per iteration, then a scalar tail finishes the
// remainder. Out-of-image taps contribute zero (zero padding); the byte offset
// of every guarded load is clamped to input_max_offset so even a speculative
// vector load cannot run past the input buffer.
template <typename T>
void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
 const Size2D &dilation, const Window &window, bool has_biases)
{
 constexpr auto element_per_vector = vector_size / sizeof(T);
 // NOTE(review): the VectorType alias (neon_vector<T, element_per_vector>::type)
 // used below appears to have been dropped by the listing extraction.
 using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;

 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);

 const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});

 // X is looped manually below, so pin it to a single unit step here...
 Window execution_window = window;
 execution_window.set(Window::DimX, dim_single_unit_step);

 // ...and disable automatic iterator advancement on the axes handled by hand.
 Window win_input = window;
 win_input.set(Window::DimX, dim_manual_loop);
 win_input.set(Window::DimY, dim_manual_loop);
 win_input.set(Window::DimZ, dim_manual_loop);

 Window win_weights = win_input;
 win_weights.set(Window::DimW, dim_manual_loop);

 Window win_output = window;
 win_output.set(Window::DimX, dim_manual_loop);

 Iterator input_it(input, win_input);
 Iterator weights_it(weights, win_weights);
 Iterator output_it(output, win_output);
 Iterator biases_it{};

 if(has_biases)
 {
 biases_it = Iterator(biases, win_weights);
 }

 execute_window_loop(execution_window, [&](const Coordinates & id)
 {
 // Top-left input coordinate of the receptive field for this output point.
 // In NHWC, id.y()/id.z() are the output's width/height coordinates.
 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

 auto const base_weights_ptr = weights_it.ptr();
 uint32_t x = run_info.x_start;

 // Vectorized main loop: element_per_vector channels per iteration.
 for(; x < run_info.x_leftover_start; x += run_info.x_step)
 {
 VectorType acc = zero_vector;
 auto weights_ptr = base_weights_ptr;
 int64_t input_offset = base_input_offset;

 for(uint32_t h = 0; h < run_info.weights_height; ++h)
 {
 int64_t offs = input_offset + x * sizeof(T);
 for(uint32_t w = 0; w < run_info.weights_width; ++w)
 {
 // Out-of-image taps use the zero vector; the offset is clamped so the
 // (possibly discarded) load always stays inside the buffer.
 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
 const auto input_vals = is_valid_region ?
 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
 zero_vector;
 const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
 acc = wrapper::vmla(acc, weights_vals, input_vals);

 offs += dilation.x() * run_info.input_stride_y;
 }

 weights_ptr += run_info.weights_stride_z;
 input_offset += dilation.y() * run_info.input_stride_z;
 }

 if(has_biases)
 {
 // Per-channel bias added once per output point.
 const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
 acc = wrapper::vadd(acc, biases_vals);
 }

 wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
 }

 // Scalar tail: channels that do not fill a whole vector.
 for(; x < run_info.x_end; ++x)
 {
 auto acc_scalar = T{ 0 };
 auto weights_ptr = base_weights_ptr;
 int64_t input_offset = base_input_offset;

 for(size_t h = 0; h < run_info.weights_height; ++h)
 {
 int64_t offs = input_offset + x * sizeof(T);
 for(size_t w = 0; w < run_info.weights_width; ++w)
 {
 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
 const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
 const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);

 acc_scalar += (input_vals * weights_vals);

 offs += dilation.x() * run_info.input_stride_y;
 }

 weights_ptr += run_info.weights_stride_z;
 input_offset += dilation.y() * run_info.input_stride_z;
 }

 if(has_biases)
 {
 const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
 acc_scalar += biases_vals;
 }
 *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
 }
 },
 input_it, weights_it, biases_it, output_it);
}
225 
// Depthwise convolution, float path, generic depth_multiplier (NHWC layout).
// The execution window's X axis iterates over *input* channels; each input
// channel fans out to depth_multiplier consecutive output channels. The
// weights/output iterators step by x_step (= depth_multiplier) per X iteration
// so their ptr() lands on the first of the m outputs for the current channel.
template <typename T>
void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
 const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
{
 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);

 Window execution_window = window;
 execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

 Window win_input = execution_window;
 win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
 win_input.set(Window::DimY, dim_manual_loop);
 win_input.set(Window::DimZ, dim_manual_loop);

 Window win_weights = window;
 win_weights.set_dimension_step(Window::DimX, run_info.x_step);
 win_weights.set(Window::DimY, dim_manual_loop);
 win_weights.set(Window::DimZ, dim_manual_loop);
 win_weights.set(Window::DimW, dim_manual_loop);

 Window win_output = window;
 win_output.set_dimension_step(Window::DimX, run_info.x_step);

 Iterator input_it(input, win_input);
 Iterator weights_it(weights, win_weights);
 Iterator output_it(output, win_output);
 Iterator biases_it{};

 if(has_biases)
 {
 biases_it = Iterator(biases, win_weights);
 }

 execute_window_loop(execution_window, [&](const Coordinates & id)
 {
 // One scalar accumulator per output channel derived from this input channel.
 std::vector<T> acc(depth_multiplier, static_cast<T>(0));

 // Top-left input coordinate of the receptive field (id.y()/id.z() = W/H).
 const int input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
 const int input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
 int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

 auto weights_ptr = weights_it.ptr();
 for(size_t h = 0; h < run_info.weights_height; ++h)
 {
 int offs = input_offset;
 for(size_t w = 0; w < run_info.weights_width; ++w)
 {
 // Zero padding for out-of-image taps; clamped offset keeps the read in bounds.
 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
 const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);

 // One input value feeds all depth_multiplier output channels.
 for(size_t m = 0; m < depth_multiplier; ++m)
 {
 const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
 acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
 }

 offs += dilation.x() * run_info.input_stride_y;
 }

 weights_ptr += run_info.weights_stride_z;
 input_offset += dilation.y() * run_info.input_stride_z;
 }

 if(has_biases)
 {
 // Per-output-channel bias folded into the store.
 for(size_t m = 0; m < depth_multiplier; ++m)
 {
 const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
 }
 }
 else
 {
 for(size_t m = 0; m < depth_multiplier; ++m)
 {
 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
 }
 }
 },
 input_it, weights_it, biases_it, output_it);
}
307 
// Depthwise convolution, quantized path, depth_multiplier == 1 (NHWC).
// Accumulates raw quantized products plus running sums of inputs and weights,
// then removes the zero-point contributions algebraically:
//   Σ(q_in - in_off)(q_w - w_off)
//     = Σ q_in·q_w  -  w_off·Σ q_in  -  in_off·Σ q_w  +  N·in_off·w_off
// where the last (constant) term is precomputed as k_offset. The result is
// requantized per channel with a fixed-point multiplier/shift, offset and
// clamped to the output type's range. Out-of-image taps read the padding
// value (the input's zero-point).
template <typename T, typename TW>
void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
 const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
{
 constexpr auto element_per_vector = vector_size / sizeof(T);
 // NOTE(review): the VectorType alias used below appears to have been dropped
 // by the listing extraction.
 using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
 using AccType = int32_t;
 using AccArrayType = std::array<AccType, element_per_vector>;

 // Padding value: quantized zero, i.e. the input's zero-point.
 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
 const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});

 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);

 const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
 // Constant zero-point cross term: N taps * in_off * w_off.
 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;

 // X (channels) is looped manually; see the float variant for the same scheme.
 Window execution_window = window;
 execution_window.set(Window::DimX, dim_single_unit_step);

 Window win_input = window;
 win_input.set(Window::DimX, dim_manual_loop);
 win_input.set(Window::DimY, dim_manual_loop);
 win_input.set(Window::DimZ, dim_manual_loop);

 Window win_weights = win_input;
 win_weights.set(Window::DimW, dim_manual_loop);

 Window win_output = window;
 win_output.set(Window::DimX, dim_manual_loop);

 Iterator input_it(input, win_input);
 Iterator weights_it(weights, win_weights);
 Iterator output_it(output, win_output);
 Iterator biases_it{};

 if(has_biases)
 {
 biases_it = Iterator(biases, win_weights);
 }

 execute_window_loop(execution_window, [&](const Coordinates & id)
 {
 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
 auto const base_weights_ptr = weights_it.ptr();
 size_t x = run_info.x_start;

 // Vectorized main loop over element_per_vector channels at a time.
 for(; x < run_info.x_leftover_start; x += run_info.x_step)
 {
 AccArrayType acc{};    // raw products Σ q_in·q_w, per lane
 AccArrayType in_sum{}; // Σ q_in, per lane
 AccArrayType we_sum{}; // Σ q_w, per lane

 auto weights_ptr = base_weights_ptr;
 auto input_offset = base_input_offset;

 for(size_t h = 0; h < run_info.weights_height; ++h)
 {
 int64_t offs = input_offset + x * sizeof(T);
 for(size_t w = 0; w < run_info.weights_width; ++w)
 {
 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
 const auto input_vals = is_valid_region ?
 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
 out_of_bound_vector;
 const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);

 // Lane-wise accumulation in 32-bit to avoid 8-bit overflow.
 for(size_t i = 0; i < element_per_vector; ++i)
 {
 acc.at(i) += input_vals[i] * weights_vals[i];
 in_sum.at(i) += input_vals[i];
 we_sum.at(i) += weights_vals[i];
 }

 offs += dilation.x() * run_info.input_stride_y;
 }

 weights_ptr += run_info.weights_stride_z;
 input_offset += dilation.y() * run_info.input_stride_z;
 }

 VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
 for(size_t i = 0; i < element_per_vector; ++i)
 {
 // Remove zero-point contributions (see expansion in the header comment).
 acc.at(i) -= in_sum.at(i) * weights_qoffset;
 acc.at(i) -= we_sum.at(i) * input_qoffset;
 acc.at(i) += k_offset;

 if(has_biases)
 {
 acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
 }

 // Per-channel fixed-point requantization, then re-offset and clamp.
 const int32_t out_mul = output_multiplier.at(x + i);
 const int32_t out_shift = output_shift.at(x + i);
 if(out_shift < 0)
 {
 acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
 }
 else
 {
 acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
 }
 out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
 }

 wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
 }

 // left-over
 for(; x < run_info.x_end; ++x)
 {
 AccType acc = 0;
 AccType in_sum = 0;
 AccType we_sum = 0;

 auto weights_ptr = base_weights_ptr;
 auto input_offset = base_input_offset;

 for(size_t h = 0; h < run_info.weights_height; ++h)
 {
 int64_t offs = input_offset + x * sizeof(T);
 for(size_t w = 0; w < run_info.weights_width; ++w)
 {
 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
 const auto input_val = is_valid_region ?
 *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
 out_of_bound_value;
 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);

 acc += input_val * weights_val;
 in_sum += input_val;
 we_sum += weights_val;

 offs += dilation.x() * run_info.input_stride_y;
 }

 weights_ptr += run_info.weights_stride_z;
 input_offset += dilation.y() * run_info.input_stride_z;
 }

 T out_vals{ 0 };

 // Same zero-point correction and requantization as the vector loop.
 acc -= in_sum * weights_qoffset;
 acc -= we_sum * input_qoffset;
 acc += k_offset;

 if(has_biases)
 {
 acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
 }

 const int32_t out_mul = output_multiplier.at(x);
 const int32_t out_shift = output_shift.at(x);

 if(out_shift < 0)
 {
 acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
 }
 else
 {
 acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
 }

 out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
 *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
 }
 },
 input_it, weights_it, biases_it, output_it);
}
483 
// Depthwise convolution, quantized path, generic depth_multiplier (NHWC).
// Scalar variant: the execution window X axis walks input channels; each input
// value fans out to depth_multiplier outputs. Zero-point contributions are
// removed algebraically (same expansion as the multiplier1 variant), with
// in_sum shared across the m outputs and we_sum tracked per output channel.
template <typename T, typename TW>
void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
 const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
{
 using AccType = int32_t;

 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);

 // Padding value: quantized zero (the input's zero-point).
 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();

 const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
 // Constant zero-point cross term: N taps * in_off * w_off.
 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;

 Window execution_window = window;
 execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

 Window win_input = execution_window;
 win_input.set(Window::DimY, dim_manual_loop);
 win_input.set(Window::DimZ, dim_manual_loop);

 // Weights/output step by x_step (= depth_multiplier) per input channel.
 Window win_weights = window;
 win_weights.set_dimension_step(Window::DimX, run_info.x_step);
 win_weights.set(Window::DimY, dim_manual_loop);
 win_weights.set(Window::DimZ, dim_manual_loop);
 win_weights.set(Window::DimW, dim_manual_loop);

 Window win_output = window;
 win_output.set_dimension_step(Window::DimX, run_info.x_step);

 Iterator input_it(input, win_input);
 Iterator weights_it(weights, win_weights);
 Iterator output_it(output, win_output);
 Iterator biases_it{};

 if(has_biases)
 {
 biases_it = Iterator(biases, win_weights);
 }

 execute_window_loop(execution_window, [&](const Coordinates & id)
 {
 std::vector<AccType> acc(depth_multiplier, 0);
 std::vector<AccType> we_sum(depth_multiplier, 0);
 AccType in_sum = 0; // one input channel feeds all m outputs, so a single sum suffices

 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

 auto weights_ptr = weights_it.ptr();
 for(size_t h = 0; h < run_info.weights_height; ++h)
 {
 int offs = input_offset;
 for(size_t w = 0; w < run_info.weights_width; ++w)
 {
 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
 const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;

 for(size_t m = 0; m < depth_multiplier; ++m)
 {
 // NOTE(review): indexing uses m * sizeof(T) although the weights are TW;
 // this is only correct because T and TW have the same size here — confirm.
 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
 acc.at(m) += input_val * weights_val;

 we_sum.at(m) += weights_val;
 }

 offs += dilation.x() * run_info.input_stride_y;
 in_sum += input_val;
 }

 weights_ptr += run_info.weights_stride_z;
 input_offset += dilation.y() * run_info.input_stride_z;
 }

 for(size_t m = 0; m < depth_multiplier; ++m)
 {
 // Remove zero-point contributions, add bias, then requantize per output
 // channel (index id.x() * depth_multiplier + m) and clamp to T's range.
 acc.at(m) -= in_sum * weights_qoffset;
 acc.at(m) -= we_sum.at(m) * input_qoffset;
 acc.at(m) += k_offset;

 if(has_biases)
 {
 acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
 }

 const int32_t out_mul = output_multiplier.at(id.x() * depth_multiplier + m);
 const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
 if(out_shift < 0)
 {
 acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
 }
 else
 {
 acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
 }
 *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
 }
 },
 input_it, weights_it, biases_it, output_it);
}
586 
// Depthwise convolution, quantized path, optimized for per-tensor quantization
// with a depth_multiplier handled in chunks of vector_size (the loop structure
// requires depth_multiplier to be a multiple of vector_size; a single
// out_mul/out_shift pair — index 0 — is used for all channels).
// Unlike the other quantized variants, zero-points are subtracted up front on
// widened 16-bit vectors and out-of-image taps are simply skipped (their
// zero-point-corrected contribution is zero), so no sum bookkeeping is needed.
template <typename T, typename TW>
void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
 const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
{
 constexpr int half_vec = vector_size / 2;

 using AccType = int32_t;
 using AccVectorType = typename wrapper::traits::neon_vector<AccType, half_vec>::type;
 using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
 using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;

 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);

 // Zero-points widened to 16-bit lanes for the vsub below.
 const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(input->info()->quantization_info().uniform().offset), TagType{})));
 const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
 const auto output_qoffset_vec = wrapper::vdup_n(output->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});

 // Saturation bounds of the output element type, in 32-bit lanes.
 const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
 const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
 const auto zero = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});

 // Per-tensor quantization: one multiplier/shift for every output channel.
 const auto out_mul = output_multiplier.at(0);
 const auto out_shift = output_shift.at(0);

 Window execution_window = window;
 execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

 Window win_input = execution_window;
 win_input.set(Window::DimY, dim_manual_loop);
 win_input.set(Window::DimZ, dim_manual_loop);

 Window win_weights = window;
 win_weights.set_dimension_step(Window::DimX, run_info.x_step);
 win_weights.set(Window::DimY, dim_manual_loop);
 win_weights.set(Window::DimZ, dim_manual_loop);
 win_weights.set(Window::DimW, dim_manual_loop);

 Window win_output = window;
 win_output.set_dimension_step(Window::DimX, run_info.x_step);

 Iterator input_it(input, win_input);
 Iterator weights_it(weights, win_weights);
 Iterator output_it(output, win_output);
 Iterator biases_it{};

 if(has_biases)
 {
 biases_it = Iterator(biases, win_weights);
 }

 // Two 4-lane 32-bit accumulators (low/high halves) per vector_size channels;
 // allocated once outside the loop and re-zeroed per output point.
 std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
 std::vector<AccVectorType> acc1(depth_multiplier / vector_size);

 execute_window_loop(execution_window, [&](const Coordinates & id)
 {
 std::fill(begin(acc0), end(acc0), zero);
 std::fill(begin(acc1), end(acc1), zero);

 const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
 const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

 auto weights_ptr = weights_it.ptr();
 for(size_t h = 0; h < run_info.weights_height; ++h)
 {
 // Bounds checks are done inline; out-of-image taps are skipped entirely.
 const int32_t current_h = input_z + h * dilation.y();
 if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
 {
 int offs = input_offset;
 for(size_t w = 0; w < run_info.weights_width; ++w)
 {
 const int32_t current_w = input_y + w * dilation.x();
 if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
 {
 // Broadcast the single input value, widen to s16 and remove its offset.
 const auto input_8x8 = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
 const auto input_s16x8 = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
 const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);

 for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
 {
 const auto weights_8x8 = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
 const auto weights_s16x8 = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
 const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);

 // Widening multiply-accumulate into the 32-bit low/high halves.
 acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
 acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
 }
 }

 offs += dilation.x() * run_info.input_stride_y;
 }
 }

 weights_ptr += run_info.weights_stride_z;
 input_offset += dilation.y() * run_info.input_stride_z;
 }

 for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
 {
 if(has_biases)
 {
 const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
 const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));

 acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
 acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
 }

 // Fixed-point requantization, output offset, then clamp to T's range.
 if(out_shift < 0)
 {
 acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
 acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
 }
 else
 {
 acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
 acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
 }

 acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
 acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);

 // Narrow 2x4 s32 -> 8x s16, then saturating-narrow to 8-bit for the store.
 const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
 wrapper::vmovn(acc1.at(i)));

 if(std::is_same<T, uint8_t>::value)
 {
 wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
 }
 else
 {
 wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
 }
 }
 },
 input_it, weights_it, biases_it, output_it);
}
724 
// Static validation of the kernel's arguments; returns an error Status on the
// first failed check. NOTE(review): several checks in this listing have empty
// bodies / elided lines (data-type checks, bias-type checks, output-shape
// check) — the extraction appears to have dropped those macro lines; the
// visible checks are documented below.
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
{
 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
 ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
 // Dilated kernel extent must fit inside the padded input, per axis
 // (dims 1/2 are W/H in NHWC).
 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > input->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > input->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
 // Output channels = input channels * depth multiplier.
 ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * info.depth_multiplier) != weights->dimension(0));
 ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
 ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1));

 if(is_data_type_quantized_per_channel(weights->data_type()))
 {
 // Per-channel quantization needs one scale per output channel.
 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
 }
 else
 {
 // NOTE(review): a data-type consistency check appears elided here.
 }

 if(biases != nullptr)
 {
 ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
 ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));

 if(is_data_type_quantized_asymmetric(input->data_type()))
 {
 // NOTE(review): a check that quantized biases are S32 appears elided here.
 }
 else
 {
 // NOTE(review): a check that float biases match the input type appears elided here.
 }
 }

 if(output->total_size() != 0)
 {
 // NOTE(review): an output-shape/type validation appears elided here.
 }

 return Status{};
}
772 } // namespace
773 
// NOTE(review): the constructor signature line is missing from this listing;
// this is the default constructor's initializer list. All state starts empty;
// _depth_multiplier defaults to 1.
 : _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
{
}
778 
780 {
781  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
782  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, weights, (biases != nullptr) ? biases : nullptr, output, info));
783 
784  _conv_info = info.pad_stride_info;
785  _depth_multiplier = info.depth_multiplier;
786  _dilation = info.dilation;
787  _has_biases = (biases != nullptr);
788 
789  if(is_data_type_quantized(input->data_type()))
790  {
791  const auto input_scale = input->quantization_info().uniform().scale;
792  const auto output_scale = output->quantization_info().uniform().scale;
793 
794  auto weights_scale = weights->quantization_info().scale();
796  {
797  for(size_t i = 1; i < weights->dimension(channel_idx); ++i)
798  {
799  weights_scale.push_back(weights_scale.front());
800  }
801  }
802 
803  for(const auto &s : weights_scale)
804  {
805  int32_t out_mult = 0;
806  int32_t out_shift = 0;
807  const float multiplier = input_scale * s / output_scale;
808  arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
809 
810  _output_multiplier.push_back(out_mult);
811  _output_shift.push_back(out_shift);
812  }
813  }
814 
815  switch(weights->data_type())
816  {
817  case DataType::QASYMM8:
818  _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, uint8_t>;
819  break;
821  _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
822  break;
824  if(input->data_type() == DataType::QASYMM8)
825  {
826  _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, int8_t>;
827  }
828  else
829  {
830  _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
831  }
832  break;
833 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
834  case DataType::F16:
835  _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float16_t, float16_t>;
836  break;
837 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
838  case DataType::F32:
839  _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float, float>;
840  break;
841  default:
842  ARM_COMPUTE_ERROR("Data type not supported");
843  break;
844  }
845 
847  auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
848 
849  Window win = calculate_max_window(*output, Steps());
850  ICpuKernel::configure(win);
851 }
852 
// Static validation entry point: delegates to the file-local validate_arguments() and
// returns its status (any failing ARM_COMPUTE_RETURN_* check short-circuits here).
// NOTE(review): the signature line (doxygen line 853) was lost in extraction — per the
// class documentation it is
// Status CpuDepthwiseConvolutionNativeKernel::validate(const ITensorInfo *input,
//     const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
//     const ConvolutionInfo &info); confirm against the repository source.
854 {
855  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, info));
856  return Status{};
857 }
858 
// Floating-point overload of run_depthwise, enabled via SFINAE ("FloatEnalber" is an
// upstream typo for "Enabler" in the class header; it cannot be renamed here without
// changing the declaration). Dispatches to the specialized multiplier-1 loop when
// _depth_multiplier == 1, otherwise to the generic loop that also takes the multiplier.
// NOTE(review): doxygen lines 863-864 were lost in extraction (presumably
// ARM_COMPUTE_UNUSED(...) / kernel-validation macros); confirm against the repository source.
859 template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::FloatEnalber<T>>
860 void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
861  ITensor *dst, const Window &window, bool has_biases)
862 {
865 
866  if(_depth_multiplier == 1)
867  {
868  depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, _conv_info, _dilation, window, has_biases);
869  }
870  else
871  {
872  depthwise_loop_generic_fp<T>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, window, has_biases);
873  }
874 }
875 
// Quantized 8-bit overload of run_depthwise, enabled via SFINAE ("Quantized8bitEnalber"
// is an upstream typo for "Enabler" in the class header; it cannot be renamed here
// without changing the declaration). Three-way dispatch:
//   - _depth_multiplier == 1: the specialized multiplier-1 quantized loop;
//   - multiplier is a power of two, >= 8, and quantization is per-tensor: a dedicated
//     pow2 per-tensor loop;
//   - otherwise: the generic quantized loop.
// All quantized loops receive the precomputed _output_multiplier/_output_shift
// requantization vectors built in configure().
// NOTE(review): doxygen lines 880-881 were lost in extraction (presumably
// ARM_COMPUTE_UNUSED(...) / kernel-validation macros); confirm against the repository source.
876 template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::Quantized8bitEnalber<T>>
877 void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
878  ITensor *dst, const Window &window, bool has_biases)
879 {
882 
883  if(_depth_multiplier == 1)
884  {
885  depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
886  }
887  else
888  {
// x & (x - 1) == 0 is the standard power-of-two test (also true for 0, which the
// >= 8 check below excludes).
889  const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);
890  const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
891 
892  if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)
893  {
894  depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
895  }
896  else
897  {
898  depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
899  }
900  }
901 }
902 
// run_op(): kernel execution entry point. Unpacks the source, weights, optional biases
// and destination tensors from the tensor pack and invokes the member-function pointer
// selected in configure() over the given window.
// NOTE(review): the signature line (doxygen line 903) and lines 905-907 were lost in
// extraction — the signature is run_op(ITensorPack &tensors, const Window &window,
// const ThreadInfo &info) per the index below, and the missing lines are presumably
// ARM_COMPUTE_UNUSED(info) plus the ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL /
// ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW checks; confirm against the repository source.
904 {
908  ARM_COMPUTE_ERROR_ON(_func == nullptr);
909 
910  const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
911  const auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
912  const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
913  auto dst = tensors.get_tensor(TensorType::ACL_DST);
914  (this->*_func)(src, weights, biases, dst, window, _has_biases);
915 }
916 } // namespace kernels
917 } // namespace cpu
918 } // namespace arm_compute
const size_t weights_stride_z
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
Definition: Utils.h:967
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
SimpleTensor< float > w
Definition: DFT.cpp:156
Traits defined on Arm® Neon™ vectors.
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
Shape of a tensor.
Definition: TensorShape.h:39
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
Calculate the depthwise convolution output shape of a tensor.
uint32x2_t vmovn(const uint64x2_t &a)
Definition: movn.h:39
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:108
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
const size_t conv_pad_left
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
const size_t conv_stride_x
const uint32_t x_start
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
size_t element_size_from_data_type(DataType dt)
The size in bytes of the data type.
Definition: Utils.h:185
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
const DataLayout data_layout
Definition: Im2Col.cpp:151
Store the tensor's metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Definition: sub.h:39
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
Status class.
Definition: Error.h:52
const size_t conv_stride_y
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
decltype(strategy::transforms) typedef type
Interface for CPU tensor.
Definition: ITensor.h:36
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Definition: Validate.h:284
SimpleTensor< float > src
Definition: DFT.cpp:155
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
const size_t num_read_elements_per_iteration
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Definition: Validate.h:159
1 channel, 1 S32 per channel
const uint32_t x_end
uint32x2_t vqmovn(const uint64x2_t &a)
Definition: movn.h:52
const DataType data_type
Definition: Im2Col.cpp:150
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
Definition: ITensorPack.cpp:45
const size_t input_stride_y
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
library fill(src, distribution, 0)
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
Definition: Utils.h:1044
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
int16x4_t vreinterpret(const uint16x4_t &a)
Definition: reinterpret.h:44
const size_t input_width
const uint32_t x_leftover_start
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
Definition: min.h:39
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
const std::vector< float > & scale() const
Scale vector accessor.
const size_t input_height
uint8x8_t vgetlow(const uint8x16_t val)
Definition: getlow.h:39
void end(TokenStream &in, bool &valid)
Definition: MLGOParser.cpp:290
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
Definition: combine.h:39
static constexpr size_t DimW
Alias for dimension 3 also known as W dimension.
Definition: Window.h:49
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
uint16x8_t vmlal(const uint16x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Definition: mla.h:76
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:915
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
Definition: Utils.h:989
quantized, symmetric per channel fixed-point 8-bit number
uint8x8_t vgethigh(const uint8x16_t val)
Definition: gethigh.h:39
void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info)
Initialize the function's source, destination and parameters.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
const size_t weights_width
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
const size_t weights_height
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
Definition: ITensorPack.cpp:55
Information about executing thread and CPU.
Definition: CPPTypes.h:252
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
Static function to check if given info will lead to a valid configuration of CpuDepthwiseConvolutionN...
T fma(T x, T y, T z)
Computes (x*y) + z as if to infinite precision and rounded only once to fit the result type.
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Definition: Window.h:47
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:541
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:788
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
const size_t weights_stride_y
const size_t input_stride_z
const size_t input_depth
const size_t input_max_offset
uint8x8_t vload(const uint8_t *ptr)
Definition: load.h:39
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
Tensor packing service.
Definition: ITensorPack.h:37
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
const size_t conv_pad_top
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Definition: mla.h:46
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
Definition: Helpers.inl:193
uint16x8_t vmovl(const uint8x8_t &a)
Definition: movl.h:39
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
Definition: max.h:39
const uint32_t x_step
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:201