Compute Library 21.02 — source listing of NEDepthwiseConvolutionLayerNativeKernel.cpp
1 /*
2  * Copyright (c) 2019-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
29 #include "src/core/CPP/Validate.h"
30 #include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
36 
37 namespace arm_compute
38 {
39 namespace
40 {
// This kernel only operates on NHWC data; resolve the dimension indices for
// that fixed layout once, up front.
constexpr auto data_layout   = DataLayout::NHWC;
const size_t   width_idx     = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const size_t   height_idx    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const size_t   channel_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);

// Zero-step dimension: the window loop does not advance it, the kernel body
// walks that axis manually.
constexpr auto dim_manual_loop = Window::Dimension(0, 0, 0);
// Dimension executed exactly once per window iteration.
constexpr auto dim_single_unit_step = Window::Dimension(0, 1, 1);
// Width in bytes of the vectors used by the vectorized paths
// (element count per vector = vector_size / sizeof(element)).
constexpr size_t vector_size = 8;
49 
50 struct DepthwiseConvolutionRunInfo
51 {
53  const uint32_t x_start;
54  const uint32_t x_end;
55  const uint32_t x_step;
56  const uint32_t x_leftover_start;
57  const size_t input_stride_y;
58  const size_t input_stride_z;
59  const size_t input_max_offset;
60  const size_t weights_width;
61  const size_t weights_height;
62  const size_t weights_stride_y;
63  const size_t weights_stride_z;
64  const size_t conv_stride_x;
65  const size_t conv_stride_y;
66  const size_t conv_pad_left;
67  const size_t conv_pad_top;
68  const size_t input_height;
69  const size_t input_width;
70  const size_t input_depth;
71 
72  DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1)
73  : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
74  x_start(w.x().start()),
75  x_end(w.x().end()),
76  x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
77  x_leftover_start(std::max(static_cast<int32_t>(w.x().end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))),
78  input_stride_y(input.strides_in_bytes().y()),
79  input_stride_z(input.strides_in_bytes().z()),
80  input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
81  weights_width(weights.dimension(width_idx)),
82  weights_height(weights.dimension(height_idx)),
83  weights_stride_y(weights.strides_in_bytes().y()),
84  weights_stride_z(weights.strides_in_bytes().z()),
85  conv_stride_x(conv_info.stride().first),
86  conv_stride_y(conv_info.stride().second),
87  conv_pad_left(conv_info.pad_left()),
88  conv_pad_top(conv_info.pad_top()),
89  input_height(input.dimension(height_idx)),
90  input_width(input.dimension(width_idx)),
91  input_depth(input.dimension(channel_idx))
92  {
93  }
94 };
95 
96 inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
97 {
98  const int32_t current_h = base_h + h * dilation.y();
99  const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
100 
101  const int32_t current_w = base_w + w * dilation.x();
102  const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
103 
104  return is_valid_h && is_valid_w;
105 }
106 
107 template <typename T>
108 void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
109  const Size2D &dilation, const Window &window, bool has_biases)
110 {
111  constexpr auto element_per_vector = vector_size / sizeof(T);
113  using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
114 
115  const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
116 
117  const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
118 
119  Window execution_window = window;
120  execution_window.set(Window::DimX, dim_single_unit_step);
121 
122  Window win_input = window;
123  win_input.set(Window::DimX, dim_manual_loop);
124  win_input.set(Window::DimY, dim_manual_loop);
125  win_input.set(Window::DimZ, dim_manual_loop);
126 
127  Window win_weights = win_input;
128  win_weights.set(Window::DimW, dim_manual_loop);
129 
130  Window win_output = window;
131  win_output.set(Window::DimX, dim_manual_loop);
132 
133  Iterator input_it(input, win_input);
134  Iterator weights_it(weights, win_weights);
135  Iterator output_it(output, win_output);
136  Iterator biases_it{};
137 
138  if(has_biases)
139  {
140  biases_it = Iterator(biases, win_weights);
141  }
142 
143  execute_window_loop(execution_window, [&](const Coordinates & id)
144  {
145  const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
146  const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
147  const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
148 
149  auto const base_weights_ptr = weights_it.ptr();
150  uint32_t x = run_info.x_start;
151 
152  for(; x < run_info.x_leftover_start; x += run_info.x_step)
153  {
154  VectorType acc = zero_vector;
155  auto weights_ptr = base_weights_ptr;
156  int64_t input_offset = base_input_offset;
157 
158  for(uint32_t h = 0; h < run_info.weights_height; ++h)
159  {
160  int64_t offs = input_offset + x * sizeof(T);
161  for(uint32_t w = 0; w < run_info.weights_width; ++w)
162  {
163  const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
164  const auto input_vals = is_valid_region ?
165  wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
166  zero_vector;
167  const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
168  acc = wrapper::vmla(acc, weights_vals, input_vals);
169 
170  offs += dilation.x() * run_info.input_stride_y;
171  }
172 
173  weights_ptr += run_info.weights_stride_z;
174  input_offset += dilation.y() * run_info.input_stride_z;
175  }
176 
177  if(has_biases)
178  {
179  const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
180  acc = wrapper::vadd(acc, biases_vals);
181  }
182 
183  wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
184  }
185 
186  for(; x < run_info.x_end; ++x)
187  {
188  auto acc_scalar = T{ 0 };
189  auto weights_ptr = base_weights_ptr;
190  int64_t input_offset = base_input_offset;
191 
192  for(size_t h = 0; h < run_info.weights_height; ++h)
193  {
194  int64_t offs = input_offset + x * sizeof(T);
195  for(size_t w = 0; w < run_info.weights_width; ++w)
196  {
197  const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
198  const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
199  const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
200 
201  acc_scalar += (input_vals * weights_vals);
202 
203  offs += dilation.x() * run_info.input_stride_y;
204  }
205 
206  weights_ptr += run_info.weights_stride_z;
207  input_offset += dilation.y() * run_info.input_stride_z;
208  }
209 
210  if(has_biases)
211  {
212  const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
213  acc_scalar += biases_vals;
214  }
215  *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
216  }
217  },
218  input_it, weights_it, biases_it, output_it);
219 }
220 
/** Depthwise convolution for float types with a generic depth_multiplier.
 *
 * One execution-window iteration per input channel; each iteration produces
 * depth_multiplier consecutive output channels, accumulated in a scalar
 * vector of per-multiplier accumulators (no SIMD on the channel axis).
 *
 * @param input            Input tensor (NHWC).
 * @param weights          Weights tensor.
 * @param biases           Biases tensor; may be nullptr when @p has_biases is false.
 * @param output           Output tensor.
 * @param conv_info        Convolution stride and padding information.
 * @param dilation         Kernel dilation along x/y.
 * @param depth_multiplier Output channels generated per input channel.
 * @param window           Execution window.
 * @param has_biases       True to accumulate biases into the result.
 */
template <typename T>
void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                               const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
{
    const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);

    // X of the execution window walks the input channels one by one.
    Window execution_window = window;
    execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

    Window win_input = execution_window;
    // (DimX already set by the execution_window copy; re-set here is redundant
    // but harmless.)
    win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    // Weights/output advance by x_step (= depth_multiplier) along X so each
    // iteration lands on the group of outputs for the current input channel.
    Window win_weights = window;
    win_weights.set_dimension_step(Window::DimX, run_info.x_step);
    win_weights.set(Window::DimY, dim_manual_loop);
    win_weights.set(Window::DimZ, dim_manual_loop);
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set_dimension_step(Window::DimX, run_info.x_step);

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        // One accumulator per output channel derived from this input channel.
        std::vector<T> acc(depth_multiplier, static_cast<T>(0));

        // Receptive-field origin for this output position (may be negative
        // because of padding).
        const int input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        int       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < run_info.weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < run_info.weights_width; ++w)
            {
                // Out-of-image taps contribute zero (zero padding); the clamp
                // keeps the load address inside the tensor.
                const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                const auto input_val       = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);

                for(size_t m = 0; m < depth_multiplier; ++m)
                {
                    const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
                    // Fused multiply-add keeps a single rounding per step.
                    acc.at(m)              = support::cpp11::fma(weights_val, input_val, acc.at(m));
                }

                offs += dilation.x() * run_info.input_stride_y;
            }

            weights_ptr += run_info.weights_stride_z;
            input_offset += dilation.y() * run_info.input_stride_z;
        }

        if(has_biases)
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                const auto biases_val                                     = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
            }
        }
        else
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
            }
        }
    },
    input_it, weights_it, biases_it, output_it);
}
302 
303 template <typename T, typename TW>
304 void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
305  const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
306 {
307  constexpr auto element_per_vector = vector_size / sizeof(T);
309  using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
310  using AccType = int32_t;
311  using AccArrayType = std::array<AccType, element_per_vector>;
312 
313  const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
314  const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
315 
316  const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
317 
318  const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
319  const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
320  const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
321  const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
322 
323  Window execution_window = window;
324  execution_window.set(Window::DimX, dim_single_unit_step);
325 
326  Window win_input = window;
327  win_input.set(Window::DimX, dim_manual_loop);
328  win_input.set(Window::DimY, dim_manual_loop);
329  win_input.set(Window::DimZ, dim_manual_loop);
330 
331  Window win_weights = win_input;
332  win_weights.set(Window::DimW, dim_manual_loop);
333 
334  Window win_output = window;
335  win_output.set(Window::DimX, dim_manual_loop);
336 
337  Iterator input_it(input, win_input);
338  Iterator weights_it(weights, win_weights);
339  Iterator output_it(output, win_output);
340  Iterator biases_it{};
341 
342  if(has_biases)
343  {
344  biases_it = Iterator(biases, win_weights);
345  }
346 
347  execute_window_loop(execution_window, [&](const Coordinates & id)
348  {
349  const int32_t input_y = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
350  const int32_t input_z = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
351  const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
352  auto const base_weights_ptr = weights_it.ptr();
353  size_t x = run_info.x_start;
354 
355  for(; x < run_info.x_leftover_start; x += run_info.x_step)
356  {
357  AccArrayType acc{};
358  AccArrayType in_sum{};
359  AccArrayType we_sum{};
360 
361  auto weights_ptr = base_weights_ptr;
362  auto input_offset = base_input_offset;
363 
364  for(size_t h = 0; h < run_info.weights_height; ++h)
365  {
366  int64_t offs = input_offset + x * sizeof(T);
367  for(size_t w = 0; w < run_info.weights_width; ++w)
368  {
369  const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
370  const auto input_vals = is_valid_region ?
371  wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
372  out_of_bound_vector;
373  const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
374 
375  for(size_t i = 0; i < element_per_vector; ++i)
376  {
377  acc.at(i) += input_vals[i] * weights_vals[i];
378  in_sum.at(i) += input_vals[i];
379  we_sum.at(i) += weights_vals[i];
380  }
381 
382  offs += dilation.x() * run_info.input_stride_y;
383  }
384 
385  weights_ptr += run_info.weights_stride_z;
386  input_offset += dilation.y() * run_info.input_stride_z;
387  }
388 
389  VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
390  for(size_t i = 0; i < element_per_vector; ++i)
391  {
392  acc.at(i) -= in_sum.at(i) * weights_qoffset;
393  acc.at(i) -= we_sum.at(i) * input_qoffset;
394  acc.at(i) += k_offset;
395 
396  if(has_biases)
397  {
398  acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
399  }
400 
401  const int32_t out_mul = output_multiplier.at(x + i);
402  const int32_t out_shift = output_shift.at(x + i);
403  if(out_shift < 0)
404  {
405  acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
406  }
407  else
408  {
409  acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
410  }
411  out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
412  }
413 
414  wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
415  }
416 
417  // left-over
418  for(; x < run_info.x_end; ++x)
419  {
420  AccType acc = 0;
421  AccType in_sum = 0;
422  AccType we_sum = 0;
423 
424  auto weights_ptr = base_weights_ptr;
425  auto input_offset = base_input_offset;
426 
427  for(size_t h = 0; h < run_info.weights_height; ++h)
428  {
429  int64_t offs = input_offset + x * sizeof(T);
430  for(size_t w = 0; w < run_info.weights_width; ++w)
431  {
432  const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
433  const auto input_val = is_valid_region ?
434  *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
435  out_of_bound_value;
436  const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
437 
438  acc += input_val * weights_val;
439  in_sum += input_val;
440  we_sum += weights_val;
441 
442  offs += dilation.x() * run_info.input_stride_y;
443  }
444 
445  weights_ptr += run_info.weights_stride_z;
446  input_offset += dilation.y() * run_info.input_stride_z;
447  }
448 
449  T out_vals{ 0 };
450 
451  acc -= in_sum * weights_qoffset;
452  acc -= we_sum * input_qoffset;
453  acc += k_offset;
454 
455  if(has_biases)
456  {
457  acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
458  }
459 
460  const int32_t out_mul = output_multiplier.at(x);
461  const int32_t out_shift = output_shift.at(x);
462 
463  if(out_shift < 0)
464  {
465  acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
466  }
467  else
468  {
469  acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
470  }
471 
472  out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
473  *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
474  }
475  },
476  input_it, weights_it, biases_it, output_it);
477 }
478 
/** Depthwise convolution for quantized types with a generic depth_multiplier.
 *
 * One execution-window iteration per input channel produces depth_multiplier
 * output channels. Accumulation is int32 with the asymmetric-quantization
 * offset expansion (see the multiplier-1 quantized path), requantized per
 * output channel via output_multiplier / output_shift.
 *
 * @param input             Input tensor (NHWC, quantized).
 * @param weights           Weights tensor (element type TW).
 * @param biases            S32 biases; may be nullptr when @p has_biases is false.
 * @param output            Output tensor.
 * @param conv_info         Convolution stride and padding information.
 * @param dilation          Kernel dilation along x/y.
 * @param depth_multiplier  Output channels generated per input channel.
 * @param output_multiplier Per-channel requantization multipliers.
 * @param output_shift      Per-channel requantization shifts.
 * @param window            Execution window.
 * @param has_biases        True to accumulate biases into the result.
 */
template <typename T, typename TW>
void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                      const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
{
    using AccType = int32_t;

    const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);

    // Quantized zero used for padded taps.
    const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();

    const int32_t input_qoffset   = input->info()->quantization_info().uniform().offset;
    const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
    const int32_t output_qoffset  = output->info()->quantization_info().uniform().offset;
    // Constant term of the offset expansion: K * in_off * w_off.
    const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;

    // X of the execution window walks the input channels one by one.
    Window execution_window = window;
    execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

    Window win_input = execution_window;
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    // Weights/output advance by x_step (= depth_multiplier) along X.
    Window win_weights = window;
    win_weights.set_dimension_step(Window::DimX, run_info.x_step);
    win_weights.set(Window::DimY, dim_manual_loop);
    win_weights.set(Window::DimZ, dim_manual_loop);
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set_dimension_step(Window::DimX, run_info.x_step);

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        // Per-multiplier accumulators and weight sums; the input sum is
        // shared since every multiplier reads the same input channel.
        std::vector<AccType> acc(depth_multiplier, 0);
        std::vector<AccType> we_sum(depth_multiplier, 0);
        AccType              in_sum = 0;

        // Receptive-field origin; may be negative because of padding.
        const int32_t input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int32_t input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        int64_t       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < run_info.weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < run_info.weights_width; ++w)
            {
                // Padded taps read the quantized zero; the clamp keeps the
                // load address inside the tensor.
                const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                const auto input_val       = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;

                for(size_t m = 0; m < depth_multiplier; ++m)
                {
                    // NOTE(review): stride uses sizeof(T) for TW-typed data —
                    // relies on sizeof(T) == sizeof(TW) (both 8-bit quantized
                    // types here); confirm if new type pairs are added.
                    const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
                    acc.at(m) += input_val * weights_val;

                    we_sum.at(m) += weights_val;
                }

                offs += dilation.x() * run_info.input_stride_y;
                in_sum += input_val;
            }

            weights_ptr += run_info.weights_stride_z;
            input_offset += dilation.y() * run_info.input_stride_z;
        }

        for(size_t m = 0; m < depth_multiplier; ++m)
        {
            // Offset expansion, bias, then fixed-point requantization.
            acc.at(m) -= in_sum * weights_qoffset;
            acc.at(m) -= we_sum.at(m) * input_qoffset;
            acc.at(m) += k_offset;

            if(has_biases)
            {
                acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
            }

            const int32_t out_mul   = output_multiplier.at(id.x() * depth_multiplier + m);
            const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
            if(out_shift < 0)
            {
                acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
            }
            else
            {
                acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
            }
            *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
        }
    },
    input_it, weights_it, biases_it, output_it);
}
581 
/** Vectorized depthwise convolution for per-tensor quantized types when the
 * depth_multiplier allows full-vector processing.
 *
 * Each input value is broadcast and multiplied against vector_size weights at
 * a time; accumulation happens in two int32 halves (acc0/acc1) per group via
 * widening multiply-accumulate. A single requantization multiplier/shift is
 * used (per-tensor quantization only).
 *
 * NOTE(review): the loops advance m by vector_size and acc0/acc1 are sized
 * depth_multiplier / vector_size, so depth_multiplier must be a multiple of
 * vector_size — the caller is expected to guarantee this (function name
 * suggests a power-of-two constraint; confirm at the dispatch site).
 *
 * @param input             Input tensor (NHWC, quantized per-tensor).
 * @param weights           Weights tensor (element type TW).
 * @param biases            S32 biases; may be nullptr when @p has_biases is false.
 * @param output            Output tensor.
 * @param conv_info         Convolution stride and padding information.
 * @param dilation          Kernel dilation along x/y.
 * @param depth_multiplier  Output channels generated per input channel.
 * @param output_multiplier Requantization multipliers (only element 0 used).
 * @param output_shift      Requantization shifts (only element 0 used).
 * @param window            Execution window.
 * @param has_biases        True to accumulate biases into the result.
 */
template <typename T, typename TW>
void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                              const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
{
    constexpr int half_vec = vector_size / 2;

    using AccType          = int32_t;
    using AccVectorType    = typename wrapper::traits::neon_vector<AccType, half_vec>::type;
    using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
    using TagType          = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;

    const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);

    // Quantization offsets widened to 16-bit vectors so they can be
    // subtracted from the widened input/weight values.
    const auto input_qoffset_vec   = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(input->info()->quantization_info().uniform().offset), TagType{})));
    const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
    const auto output_qoffset_vec  = wrapper::vdup_n(output->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});

    // Saturation bounds of the output type, plus a zero for resetting.
    const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
    const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
    const auto zero  = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});

    // Per-tensor quantization: a single multiplier/shift for all channels.
    const auto out_mul   = output_multiplier.at(0);
    const auto out_shift = output_shift.at(0);

    // X of the execution window walks the input channels one by one.
    Window execution_window = window;
    execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

    Window win_input = execution_window;
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    // Weights/output advance by x_step (= depth_multiplier) along X.
    Window win_weights = window;
    win_weights.set_dimension_step(Window::DimX, run_info.x_step);
    win_weights.set(Window::DimY, dim_manual_loop);
    win_weights.set(Window::DimZ, dim_manual_loop);
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set_dimension_step(Window::DimX, run_info.x_step);

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    // Accumulators hoisted out of the loop and reset each iteration to avoid
    // reallocating the vectors per output position.
    std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
    std::vector<AccVectorType> acc1(depth_multiplier / vector_size);

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        std::fill(begin(acc0), end(acc0), zero);
        std::fill(begin(acc1), end(acc1), zero);

        // Receptive-field origin; may be negative because of padding.
        const int32_t input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int32_t input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        int64_t       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < run_info.weights_height; ++h)
        {
            // Bounds checks are done inline here (instead of
            // is_valid_input_region) so whole rows/taps can be skipped.
            const int32_t current_h = input_z + h * dilation.y();
            if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
            {
                int offs = input_offset;
                for(size_t w = 0; w < run_info.weights_width; ++w)
                {
                    const int32_t current_w = input_y + w * dilation.x();
                    if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
                    {
                        // Broadcast the input value, widen to 16 bit and
                        // remove the input quantization offset.
                        const auto input_8x8     = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
                        const auto input_s16x8   = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
                        const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);

                        for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
                        {
                            const auto weights_8x8     = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
                            const auto weights_s16x8   = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
                            const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);

                            // Widening multiply-accumulate into two int32 halves.
                            acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
                            acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
                        }
                    }

                    offs += dilation.x() * run_info.input_stride_y;
                }
            }

            weights_ptr += run_info.weights_stride_z;
            input_offset += dilation.y() * run_info.input_stride_z;
        }

        for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
        {
            if(has_biases)
            {
                const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
                const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));

                acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
                acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
            }

            // Fixed-point requantization, then add the output offset.
            if(out_shift < 0)
            {
                acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
                acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
            }
            else
            {
                acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
                acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
            }

            // Clamp to the output type's range before narrowing.
            acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
            acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);

            const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
                                                   wrapper::vmovn(acc1.at(i)));

            if(std::is_same<T, uint8_t>::value)
            {
                wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
            }
            else
            {
                wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
            }
        }
    },
    input_it, weights_it, biases_it, output_it);
}
719 
/** Validate the kernel configuration.
 *
 * Checks pointers, layout, dilated-kernel fit, channel consistency between
 * input/weights/biases, and quantization metadata.
 *
 * @return An error Status on the first failed check, an empty Status otherwise.
 */
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                          const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier == 0);
    // The dilated kernel must fit inside the padded input in both spatial dims.
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (dilation.x() - 1) > input->dimension(1) + conv_info.pad_left() + conv_info.pad_right());
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (dilation.y() - 1) > input->dimension(2) + conv_info.pad_top() + conv_info.pad_bottom());
    // Weights channel count = input channels * depth_multiplier.
    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * depth_multiplier) != weights->dimension(0));
    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
    ARM_COMPUTE_RETURN_ERROR_ON((conv_info.stride().first < 1) || (conv_info.stride().second < 1));

    if(is_data_type_quantized_per_channel(weights->data_type()))
    {
        // Per-channel quantization requires one scale per output channel.
        ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
    }
    else
    {
        // NOTE(review): the input/weights data-type match check for this
        // branch is not visible in this chunk.
    }

    if(biases != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));

        if(is_data_type_quantized_asymmetric(input->data_type()))
        {
            // NOTE(review): the quantized-path bias data-type check is not
            // visible in this chunk.
        }
        else
        {
            // NOTE(review): the float-path bias data-type check is not
            // visible in this chunk.
        }
    }

    if(output->total_size() != 0)
    {
        // NOTE(review): the comparison of output against the computed shape
        // is not visible in this chunk.
        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
    }

    return Status{};
}
768 } // namespace
769 
771  : _func(), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
772 {
773 }
774 
775 void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
776  const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
777 {
// Configure the kernel: validate arguments, cache tensor pointers and
// parameters, precompute the quantized output multipliers/shifts, select the
// typed run function, and set up the execution window.
778  ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
779  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation));
780 
781  _input = input;
782  _weights = weights;
783  _biases = biases;
784  _output = output;
785  _conv_info = conv_info;
786  _depth_multiplier = depth_multiplier;
787  _dilation = dilation;
788  _has_biases = (biases != nullptr);
789 
790  if(is_data_type_quantized(_input->info()->data_type()))
791  {
// Fold input/weights/output scales into fixed-point requantization
// parameters (multiplier + shift), one pair per output channel.
792  const auto input_scale = input->info()->quantization_info().uniform().scale;
793  const auto output_scale = output->info()->quantization_info().uniform().scale;
794 
795  auto weights_scale = weights->info()->quantization_info().scale();
// NOTE(review): the condition guarding this block (original line 796) is
// missing from the listing — presumably it checks for per-TENSOR quantized
// weights, in which case the single scale is replicated across channels.
797  {
798  for(size_t i = 1; i < _weights->info()->dimension(channel_idx); ++i)
799  {
800  weights_scale.push_back(weights_scale.front());
801  }
802  }
803 
804  for(const auto &s : weights_scale)
805  {
806  int32_t out_mult = 0;
807  int32_t out_shift = 0;
// Effective rescale factor from the quantized multiply-accumulate
// domain back to the output's quantized domain.
808  const float multiplier = input_scale * s / output_scale;
809  arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);
810 
811  _output_multiplier.push_back(out_mult);
812  _output_shift.push_back(out_shift);
813  }
814  }
815 
// Dispatch on the WEIGHTS data type to pick the templated run function.
816  switch(_weights->info()->data_type())
817  {
818  case DataType::QASYMM8:
819  _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t>;
820  break;
// NOTE(review): a case label (original line 821, presumably
// DataType::QASYMM8_SIGNED) is missing from this listing.
822  _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
823  break;
// NOTE(review): a case label (original line 824, presumably
// DataType::QSYMM8_PER_CHANNEL) is missing from this listing. Per-channel
// weights may pair with either an unsigned or signed quantized input.
825  if(_input->info()->data_type() == DataType::QASYMM8)
826  {
827  _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t>;
828  }
829  else
830  {
831  _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
832  }
833  break;
834 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
835  case DataType::F16:
836  _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t>;
837  break;
838 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
839  case DataType::F32:
840  _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float>;
841  break;
842  default:
843  ARM_COMPUTE_ERROR("Data type not supported");
844  break;
845  }
846 
// Auto-initialise the output tensor info (if empty) with the computed
// depthwise output shape, then configure the maximal execution window.
847  const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input->info(), *weights->info(), conv_info, depth_multiplier, dilation);
848  auto_init_if_empty(*output->info(), input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
849 
850  Window win = calculate_max_window(*output->info(), Steps());
851  Coordinates coord;
852  coord.set_num_dimensions(output->info()->num_dimensions());
// The whole output is valid: no border region is produced by this kernel.
853  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
854  INEKernel::configure(win);
855 }
856 
// Static validation entry point: thin wrapper over validate_arguments().
// NOTE(review): the first signature line (original line 857, carrying
// "Status NEDepthwiseConvolutionLayerNativeKernel::validate(...)") is missing
// from this extracted listing.
858  unsigned int depth_multiplier,
859  const Size2D &dilation)
860 {
861  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation));
862  return Status{};
863 }
864 
// Execute the kernel on the given window by dispatching to the typed run
// function selected in configure().
// NOTE(review): the signature line (original line 865) and the
// configured-kernel / window-validity assertions (original lines 868-869)
// are missing from this extracted listing.
866 {
867  ARM_COMPUTE_UNUSED(info);
870 
871  (this->*_func)(window, _has_biases);
872 }
873 
874 template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::FloatEnalber<T>>
875 void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
876 {
// Floating-point specialisation (enabled via FloatEnalber — note the typo is
// in the declared alias name and must match the header, so it is kept).
// NOTE(review): one or two lines (original 877-878, likely assertions) are
// missing from this extracted listing.
// depth_multiplier == 1 uses the vectorised fast path; otherwise fall back
// to the generic per-multiplier loop.
880  if(_depth_multiplier == 1)
881  {
882  depthwise_loop_multiplier1_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, window, has_biases);
883  }
884  else
885  {
886  depthwise_loop_generic_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window, has_biases);
887  }
888 }
889 
890 template <typename T, typename TW, NEDepthwiseConvolutionLayerNativeKernel::Quantized8bitEnalber<T>>
891 void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
892 {
// Quantized 8-bit specialisation (enabled via Quantized8bitEnalber — the
// typo is in the declared alias name and must match the header).
// NOTE(review): one or two lines (original 893-894, likely assertions) are
// missing from this extracted listing.
896  if(_depth_multiplier == 1)
897  {
// Fast path: no channel expansion.
898  depthwise_loop_multiplier1_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
899  }
900  else
901  {
// Power-of-two test via the classic (x & (x-1)) == 0 trick; note this is
// also true for _depth_multiplier == 0, but validate_arguments rejects 0.
902  const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);
903  const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(_weights->info()->data_type()));
904 
// The pow2 per-tensor path vectorises across the multiplier dimension and
// only pays off for multipliers >= 8.
905  if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)
906  {
907  depthwise_loop_pow2_quantized_per_tensor<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
908  }
909  else
910  {
911  depthwise_loop_generic_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
912  }
913  }
914 }
915 } // namespace arm_compute
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
Definition: Utils.h:1168
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
SimpleTensor< float > w
Definition: DFT.cpp:156
Traits defined on Neon vectors.
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
Shape of a tensor.
Definition: TensorShape.h:39
uint32x2_t vmovn(const uint64x2_t &a)
Definition: movn.h:39
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:108
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info, unsigned int depth_multiplier, const Size2D &dilation=Size2D(1U, 1U))
Calculate the depthwise convolution output shape of a tensor.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
size_t element_size_from_data_type(DataType dt)
The size in bytes of the data type.
Definition: Utils.h:185
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
1 channel, 1 F32 per channel
const DataLayout data_layout
Definition: Im2Col.cpp:151
Store the tensor's metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Definition: sub.h:39
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
Status class.
Definition: Error.h:52
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
decltype(strategy::transforms) typedef type
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Interface for Neon tensor.
Definition: ITensor.h:36
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Definition: Validate.h:288
Copyright (c) 2017-2021 Arm Limited.
virtual void set_valid_region(const ValidRegion &valid_region)=0
Set the valid region of the tensor.
1 channel, 1 F16 per channel
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Definition: Validate.h:163
1 channel, 1 S32 per channel
uint32x2_t vqmovn(const uint64x2_t &a)
Definition: movn.h:52
const DataType data_type
Definition: Im2Col.cpp:150
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
library fill(src, distribution, 0)
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
Definition: Utils.h:1245
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
int16x4_t vreinterpret(const uint16x4_t &a)
Definition: reinterpret.h:44
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
Definition: min.h:39
Coordinates of an item.
Definition: Coordinates.h:37
UniformQuantizationInfo uniform() const
Return per layer quantization info.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
const std::vector< float > & scale() const
Scale vector accessor.
uint8x8_t vgetlow(const uint8x16_t val)
Definition: getlow.h:39
Padding and stride information class.
Definition: Types.h:722
void end(TokenStream &in, bool &valid)
Definition: MLGOParser.cpp:290
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
Definition: combine.h:39
static constexpr size_t DimW
Alias for dimension 3 also known as W dimension.
Definition: Window.h:49
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
uint16x8_t vmlal(const uint16x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Definition: mla.h:76
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
Definition: Utils.h:1190
quantized, symmetric per channel fixed-point 8-bit number
uint8x8_t vgethigh(const uint8x16_t val)
Definition: gethigh.h:39
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
Definition: CPPTypes.h:235
T fma(T x, T y, T z)
Computes (x*y) + z as if to infinite precision and rounded only once to fit the result type...
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Definition: Window.h:47
const size_t num_read_elements_per_iteration
Class for specifying the size of an image or rectangle.
Definition: Size2D.h:34
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:545
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:792
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
uint8x8_t vload(const uint8_t *ptr)
Definition: load.h:39
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
void set_num_dimensions(size_t num_dimensions)
Set number of dimensions.
Definition: Dimensions.h:149
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Definition: mla.h:46
Container for valid region of a window.
Definition: Types.h:188
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
Definition: Helpers.inl:193
void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U))
Initialize the function&#39;s source, destination and parameters.
uint16x8_t vmovl(const uint8x8_t &a)
Definition: movl.h:39
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
Definition: max.h:39
const uint32_t x_leftover_start
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U))
Static function to check if given info will lead to a valid configuration of NEDepthwiseConvolutionLa...