Compute Library
 20.08
NEDepthwiseConvolutionLayerNativeKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
32 #include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
34 
35 namespace arm_compute
36 {
37 namespace
38 {
39 void pad_vectors(std::vector<int> &mult, std::vector<int> &shift, int vec_size)
40 {
41  ARM_COMPUTE_ERROR_ON(mult.size() != shift.size());
42  while(mult.size() % vec_size != 0)
43  {
44  mult.push_back(0);
45  shift.push_back(0);
46  }
47 }
48 
// Depthwise convolution inner loop for floating-point types when
// depth_multiplier == 1. One NEON vector of S elements of type T is produced
// per window step; the window's X dimension iterates over channels, Y/Z over
// the spatial output (NHWC layout — assumed from the stride usage below;
// confirm against the kernel's layout contract).
template <typename T, int S>
void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                   const Size2D &dilation, const Window &window, bool has_biases)
{
    using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
    using TagType    = typename wrapper::traits::neon_vector<T, S>::tag_type;

    const size_t input_stride_y = input->info()->strides_in_bytes().y();
    const size_t input_stride_z = input->info()->strides_in_bytes().z();
    // Clamp limit for manually-computed input offsets: keeps accesses for
    // padded positions inside the allocated buffer. Presumably the border
    // region is filled by a border handler before this runs — TODO confirm.
    const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
                                    input->info()->strides_in_bytes().y();
    const size_t weights_width    = weights->info()->dimension(1);
    const size_t weights_height   = weights->info()->dimension(2);
    const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
    const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
    const size_t conv_stride_x    = conv_info.stride().first;
    const size_t conv_stride_y    = conv_info.stride().second;
    const size_t conv_pad_left    = conv_info.pad_left();
    const size_t conv_pad_top     = conv_info.pad_top();

    // The input is addressed via manual byte offsets below, so collapse its
    // Y/Z window dimensions; the iterator then only advances over channels.
    Window win_input = window;
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    // Weights/biases have no dimension-3 (batch) to step over.
    Window win_weights = win_input;
    win_weights.set(3, Window::Dimension(0, 0, 0));

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, window);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(window, [&](const Coordinates & id)
    {
        VectorType acc = wrapper::vdup_n(static_cast<T>(0), TagType{});

        // Top-left input coordinate of this output element's receptive field
        // (may be negative because of left/top padding).
        const int input_y = id.y() * conv_stride_x - conv_pad_left;
        const int input_z = id.z() * conv_stride_y - conv_pad_top;
        int input_offset  = input_y * input_stride_y + input_z * input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < weights_width; ++w)
            {
                // Offsets are clamped to input_max_offset so out-of-range
                // (padded) positions still read valid memory.
                const auto input_vals   = wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
                const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * weights_stride_y));

                // acc += weights * input (per lane).
                acc = wrapper::vmla(acc, weights_vals, input_vals);
                offs += dilation.x() * input_stride_y;
            }

            weights_ptr += weights_stride_z;
            input_offset += dilation.y() * input_stride_z;
        }

        if(has_biases)
        {
            const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()));
            acc                    = wrapper::vadd(acc, biases_vals);
        }

        wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()), acc);
    },
    input_it, weights_it, biases_it, output_it);
}
121 
// Depthwise convolution for floating-point types with depth_multiplier > 1.
// Scalar implementation: each input channel produces depth_multiplier output
// channels, accumulated element by element.
template <typename T>
void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                               const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
{
    const size_t input_stride_y = input->info()->strides_in_bytes().y();
    const size_t input_stride_z = input->info()->strides_in_bytes().z();
    // Clamp limit for manually-computed input offsets: keeps accesses for
    // padded positions inside the allocated buffer.
    const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
                                    input->info()->strides_in_bytes().y();
    const size_t weights_width    = weights->info()->dimension(1);
    const size_t weights_height   = weights->info()->dimension(2);
    const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
    const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
    const size_t conv_stride_x    = conv_info.stride().first;
    const size_t conv_stride_y    = conv_info.stride().second;
    const size_t conv_pad_left    = conv_info.pad_left();
    const size_t conv_pad_top     = conv_info.pad_top();

    // Input is addressed via manual byte offsets; collapse its Y/Z window
    // dimensions so the iterator only advances over channels.
    Window win_input = window;
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    // Weights/biases have no dimension-3 (batch) to step over.
    Window win_weights = win_input;
    win_weights.set(3, Window::Dimension(0, 0, 0));

    // One input channel maps to depth_multiplier outputs, so the input
    // iterator advances a single element per window step.
    win_input.set_dimension_step(Window::DimX, 1);

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, window);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(window, [&](const Coordinates & id)
    {
        // One accumulator per generated output channel.
        std::vector<T> acc(depth_multiplier, static_cast<T>(0));

        // Top-left input coordinate of this output element's receptive field
        // (may be negative because of left/top padding).
        const int input_y = id.y() * conv_stride_x - conv_pad_left;
        const int input_z = id.z() * conv_stride_y - conv_pad_top;
        int input_offset  = input_y * input_stride_y + input_z * input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < weights_width; ++w)
            {
                // Clamped offset keeps padded positions reading valid memory.
                const auto input_val = *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));

                for(size_t m = 0; m < depth_multiplier; ++m)
                {
                    const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
                    // Fused multiply-add for precision: acc = w * in + acc.
                    acc.at(m)              = support::cpp11::fma(weights_val, input_val, acc.at(m));
                }

                offs += dilation.x() * input_stride_y;
            }

            weights_ptr += weights_stride_z;
            input_offset += dilation.y() * input_stride_z;
        }

        if(has_biases)
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                const auto biases_val                                     = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
            }
        }
        else
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
            }
        }
    },
    input_it, weights_it, biases_it, output_it);
}
205 
206 template <typename T, typename TW, int S>
207 void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
208  const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
209 {
210  using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
211  using TagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
212 
213  const size_t input_stride_y = input->info()->strides_in_bytes().y();
214  const size_t input_stride_z = input->info()->strides_in_bytes().z();
215  const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
216  input->info()->strides_in_bytes().y();
217  const size_t weights_width = weights->info()->dimension(1);
218  const size_t weights_height = weights->info()->dimension(2);
219  const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
220  const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
221  const size_t conv_stride_x = conv_info.stride().first;
222  const size_t conv_stride_y = conv_info.stride().second;
223  const size_t conv_pad_left = conv_info.pad_left();
224  const size_t conv_pad_top = conv_info.pad_top();
225 
226  const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
227  const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
228  const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
229  const int32_t k_offset = weights_width * weights_height * input_qoffset * weights_qoffset;
230 
231  Window win_input = window;
232  win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
233  win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
234 
235  Window win_weights = win_input;
236  win_weights.set(3, Window::Dimension(0, 0, 0));
237 
238  Iterator input_it(input, win_input);
239  Iterator weights_it(weights, win_weights);
240  Iterator output_it(output, window);
241  Iterator biases_it{};
242 
243  if(has_biases)
244  {
245  biases_it = Iterator(biases, win_weights);
246  }
247 
248  execute_window_loop(window, [&](const Coordinates & id)
249  {
250  std::vector<int32_t> acc(S, 0);
251  std::vector<int32_t> in_sum(S, 0);
252  std::vector<int32_t> we_sum(S, 0);
253 
254  const int input_y = id.y() * conv_stride_x - conv_pad_left;
255  const int input_z = id.z() * conv_stride_y - conv_pad_top;
256  int input_offset = input_y * input_stride_y + input_z * input_stride_z;
257 
258  auto weights_ptr = weights_it.ptr();
259  for(size_t h = 0; h < weights_height; ++h)
260  {
261  int offs = input_offset;
262  for(size_t w = 0; w < weights_width; ++w)
263  {
264  const auto input_vals = wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
265  const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * weights_stride_y));
266 
267  for(int i = 0; i < S; ++i)
268  {
269  acc.at(i) += input_vals[i] * weights_vals[i];
270  in_sum.at(i) += input_vals[i];
271  we_sum.at(i) += weights_vals[i];
272  }
273 
274  offs += dilation.x() * input_stride_y;
275  }
276 
277  weights_ptr += weights_stride_z;
278  input_offset += dilation.y() * input_stride_z;
279  }
280 
281  VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
282  for(int i = 0; i < S; ++i)
283  {
284  acc.at(i) -= in_sum.at(i) * weights_qoffset;
285  acc.at(i) -= we_sum.at(i) * input_qoffset;
286  acc.at(i) += k_offset;
287 
288  if(has_biases)
289  {
290  acc.at(i) += *reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t));
291  }
292 
293  const int out_mul = output_multiplier.at(id.x() + i);
294  const int out_shift = output_shift.at(id.x() + i);
295  if(out_shift < 0)
296  {
297  acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
298  }
299  else
300  {
301  acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
302  }
303  out_vals[i] = static_cast<T>(utility::clamp<int32_t, T>(acc.at(i)));
304  }
305 
306  wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()), out_vals);
307  },
308  input_it, weights_it, biases_it, output_it);
309 }
310 
311 template <typename T, typename TW>
312 void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
313  const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
314 {
315  const size_t input_stride_y = input->info()->strides_in_bytes().y();
316  const size_t input_stride_z = input->info()->strides_in_bytes().z();
317  const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
318  input->info()->strides_in_bytes().y();
319  const size_t weights_width = weights->info()->dimension(1);
320  const size_t weights_height = weights->info()->dimension(2);
321  const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
322  const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
323  const size_t conv_stride_x = conv_info.stride().first;
324  const size_t conv_stride_y = conv_info.stride().second;
325  const size_t conv_pad_left = conv_info.pad_left();
326  const size_t conv_pad_top = conv_info.pad_top();
327 
328  const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
329  const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
330  const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
331  const int32_t k_offset = weights_width * weights_height * input_qoffset * weights_qoffset;
332 
333  Window win_input = window;
334  win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
335  win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
336 
337  Window win_weights = win_input;
338  win_weights.set(3, Window::Dimension(0, 0, 0));
339 
340  win_input.set_dimension_step(Window::DimX, 1);
341 
342  Iterator input_it(input, win_input);
343  Iterator weights_it(weights, win_weights);
344  Iterator output_it(output, window);
345  Iterator biases_it{};
346 
347  if(has_biases)
348  {
349  biases_it = Iterator(biases, win_weights);
350  }
351 
352  execute_window_loop(window, [&](const Coordinates & id)
353  {
354  std::vector<int32_t> acc(depth_multiplier, 0);
355  std::vector<int32_t> we_sum(depth_multiplier, 0);
356  int32_t in_sum = 0;
357 
358  const int input_y = id.y() * conv_stride_x - conv_pad_left;
359  const int input_z = id.z() * conv_stride_y - conv_pad_top;
360  int input_offset = input_y * input_stride_y + input_z * input_stride_z;
361 
362  auto weights_ptr = weights_it.ptr();
363  for(size_t h = 0; h < weights_height; ++h)
364  {
365  int offs = input_offset;
366  for(size_t w = 0; w < weights_width; ++w)
367  {
368  const auto input_val = *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
369 
370  for(size_t m = 0; m < depth_multiplier; ++m)
371  {
372  const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
373  acc.at(m) += input_val * weights_val;
374 
375  we_sum.at(m) += weights_val;
376  }
377 
378  offs += dilation.x() * input_stride_y;
379  in_sum += input_val;
380  }
381 
382  weights_ptr += weights_stride_z;
383  input_offset += dilation.y() * input_stride_z;
384  }
385 
386  for(size_t m = 0; m < depth_multiplier; ++m)
387  {
388  acc.at(m) -= in_sum * weights_qoffset;
389  acc.at(m) -= we_sum.at(m) * input_qoffset;
390  acc.at(m) += k_offset;
391 
392  if(has_biases)
393  {
394  acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
395  }
396 
397  const int out_mul = output_multiplier.at(id.x() + m);
398  const int out_shift = output_shift.at(id.x() + m);
399  if(out_shift < 0)
400  {
401  acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
402  }
403  else
404  {
405  acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
406  }
407  *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<int32_t, T>(acc.at(m)));
408  }
409  },
410  input_it, weights_it, biases_it, output_it);
411 }
412 
// Validates the tensor shapes/types and convolution parameters for this
// kernel. Returns an error Status on the first failed check.
// NOTE(review): this listing has dropped several lines of this function
// (nullptr/data-type checks and at least one `if` condition) — confirm the
// full check list against the original source file.
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                          const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier == 0);
    // Dilated kernel extent must fit inside the padded input (width, then height).
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (dilation.x() - 1) > input->dimension(1) + conv_info.pad_left() + conv_info.pad_right());
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (dilation.y() - 1) > input->dimension(2) + conv_info.pad_top() + conv_info.pad_bottom());
    // Output channels = input channels * depth_multiplier.
    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * depth_multiplier) != weights->dimension(0));
    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
    ARM_COMPUTE_RETURN_ERROR_ON((conv_info.stride().first < 1) || (conv_info.stride().second < 1));

    // NOTE(review): the condition line for this branch (presumably
    // is_data_type_quantized_per_channel(weights->data_type())) is missing
    // from this listing.
    {
        // Per-channel quantization requires one scale per output channel.
        ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
    }
    else
    {
    }

    if(biases != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));

        if(is_data_type_quantized_asymmetric(input->data_type()))
        {
        }
        else
        {
        }
    }

    if(output->total_size() != 0)
    {
    }

    return Status{};
}
461 
// Auto-initializes the output tensor (if still empty) and computes the
// execution window together with the padding requirements of all tensors.
// Returns {error-status, window}; the status is an error if the required
// padding could not be granted.
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *biases,
                                                        ITensorInfo *output, const PadStrideInfo &conv_info,
                                                        unsigned int depth_multiplier, const Size2D &dilation)
{
    // Get convolved dimensions
    // NOTE(review): the line computing `output_shape` (via
    // misc::shape_calculator::compute_depthwise_convolution_shape, per the
    // cross-reference index) is missing from this listing — confirm.

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));

    // Configure kernel window (generic)
    // Multiplier==1 path processes an 8-byte vector per step; generic path is scalar.
    const unsigned int num_elems_read_per_iteration    = (depth_multiplier == 1) ? 8 / element_size_from_data_type(input->data_type()) : 1;
    const unsigned int num_elems_written_per_iteration = num_elems_read_per_iteration * depth_multiplier;

    // Configure kernel window
    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));

    AccessWindowStatic input_access(input, 0, -conv_info.pad_left(), ceil_to_multiple(num_elems_read_per_iteration, input->dimension(0)),
                                    input->dimension(1) + std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()));
    AccessWindowHorizontal weights_access(weights, 0, num_elems_written_per_iteration);
    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);

    bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);

    if(biases != nullptr)
    {
        AccessWindowHorizontal biases_access(biases, 0, num_elems_written_per_iteration);
        window_changed |= update_window_and_padding(win, biases_access);
    }

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
497 } // namespace
498 
// Default constructor: value-initializes all members; depth multiplier
// defaults to 1 and the border size to 0 until configure() is called.
// NOTE(review): the constructor signature line is missing from this listing.
    : _func(), _border_size(0), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
{
}
503 
// Returns the border size computed in configure() from the conv padding.
// NOTE(review): the method signature line is missing from this listing.
{
    return _border_size;
}
508 
// configure(): stores the tensors/parameters, precomputes the per-channel
// requantization multipliers and shifts for quantized types, selects the
// data-type-specific run_depthwise instantiation, and configures the window.
// NOTE(review): this listing is missing the method's signature line and a few
// macro/case-label lines inside the switch — confirm against the original.
                                                      const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
{
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation));

    _input            = input;
    _weights          = weights;
    _biases           = biases;
    _output           = output;
    _conv_info        = conv_info;
    _depth_multiplier = depth_multiplier;
    // Left border from pad_left; right border is the largest of the remaining pads.
    _border_size = BorderSize(_conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
    _dilation    = dilation;
    _has_biases  = (biases != nullptr);

    if(is_data_type_quantized(_input->info()->data_type()))
    {
        const auto input_scale  = input->info()->quantization_info().uniform().scale;
        const auto output_scale = output->info()->quantization_info().uniform().scale;

        auto weights_scale = weights->info()->quantization_info().scale();
        // NOTE(review): the condition line guarding this block (presumably
        // !is_data_type_quantized_per_channel(...)) is missing from this
        // listing. Uniform quantization replicates the single scale so every
        // output channel has an entry.
        {
            for(size_t i = 1; i < _weights->info()->dimension(0); ++i)
            {
                weights_scale.push_back(weights_scale.front());
            }
        }

        // Precompute fixed-point multiplier/shift per output channel:
        // effective scale = input_scale * weights_scale / output_scale.
        for(size_t i = 0; i < weights_scale.size(); ++i)
        {
            int32_t out_mult   = 0;
            int32_t out_shift  = 0;
            const float multiplier = input_scale * weights_scale.at(i) / output_scale;
            arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);

            _output_multiplier.push_back(out_mult);
            _output_shift.push_back(out_shift);
        }
    }

    // Pick the templated implementation and pad the requantization vectors to
    // the chosen vector width.
    switch(_weights->info()->data_type())
    {
        case DataType::QASYMM8:
            _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t, 8>;
            pad_vectors(_output_multiplier, _output_shift, 8);
            break;
        // NOTE(review): a case label (likely DataType::QASYMM8_SIGNED) is
        // missing from this listing before the next assignment.
            _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8>;
            pad_vectors(_output_multiplier, _output_shift, 8);
            break;
        // NOTE(review): a case label (likely DataType::QSYMM8_PER_CHANNEL) is
        // missing from this listing before the next branch.
            if(_input->info()->data_type() == DataType::QASYMM8)
            {
                _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t, 8>;
            }
            else
            {
                _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8>;
            }
            pad_vectors(_output_multiplier, _output_shift, 8);
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t, 4>;
            pad_vectors(_output_multiplier, _output_shift, 4);
            break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F32:
            _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float, 2>;
            pad_vectors(_output_multiplier, _output_shift, 2);
            break;
        default:
            ARM_COMPUTE_ERROR("Data type not supported");
            break;
    }

    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), (biases != nullptr) ? biases->info() : nullptr, _output->info(), _conv_info, _depth_multiplier, dilation);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    INEKernel::configure(win_config.second);
}
591 
// Static validation entry point: runs the argument checks, then dry-runs the
// window configuration on cloned tensor infos (so nothing is mutated).
// NOTE(review): the method signature line is missing from this listing.
                                                               unsigned int depth_multiplier,
                                                               const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), (biases != nullptr) ? biases->clone().get() : nullptr, output->clone().get(), conv_info,
                                                              depth_multiplier, dilation)
                                .first);
    return Status{};
}
602 
// run(): dispatches to the data-type-specific member function selected in
// configure(). NOTE(review): the method signature and the usual
// unconfigured-kernel/window validation macro lines are missing from this
// listing.
{

    (this->*_func)(window, _has_biases);
}
611 
// Floating-point overload of run_depthwise (SFINAE-enabled for float and,
// when available, float16_t). Dispatches to the vectorized multiplier==1
// path or the generic scalar path. NOTE(review): two validation macro lines
// at the top of the body are missing from this listing.
template < typename T, typename TW, int S, typename std::enable_if < std::is_same<T, float>::value
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                                           || std::is_same<T, float16_t>::value
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                                           ,
                                           int >::type >
void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
{

    if(_depth_multiplier == 1)
    {
        depthwise_loop_multiplier1_fp<T, S>(_input, _weights, _biases, _output, _conv_info, _dilation, window, has_biases);
    }
    else
    {
        depthwise_loop_generic_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window, has_biases);
    }
}
632 
// Quantized overload of run_depthwise: forwards the precomputed per-channel
// requantization multipliers/shifts to the chosen loop implementation.
// NOTE(review): two validation macro lines at the top of the body are missing
// from this listing.
template <typename T, typename TW, int S, typename>
void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window, bool has_biases)
{

    if(_depth_multiplier == 1)
    {
        depthwise_loop_multiplier1_quantized<T, TW, S>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
    }
    else
    {
        depthwise_loop_generic_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
    }
}
648 } // namespace arm_compute
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
Definition: Utils.h:1121
SimpleTensor< float > w
Definition: DFT.cpp:156
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info, unsigned int depth_multiplier, const Size2D &dilation=Size2D(1U, 1U))
Calculate the depthwise convolution output shape of a tensor.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
Container for 2D border size.
Definition: Types.h:272
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:545
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
size_t element_size_from_data_type(DataType dt)
The size in bytes of the data type.
Definition: Utils.h:181
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:792
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Store the tensor's metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
Status class.
Definition: Error.h:52
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Interface for NEON tensor.
Definition: ITensor.h:36
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps=Steps(), bool skip_border=false, BorderSize border_size=BorderSize())
Calculate the maximum window for a given tensor shape and border setting.
Definition: Helpers.cpp:28
Copyright (c) 2017-2020 Arm Limited.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
Definition: Helpers.inl:207
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:108
1 channel, 1 F16 per channel
ITensorInfo * info() const override
Interface to be implemented by the child class to return the tensor's metadata.
Definition: Tensor.cpp:33
BorderSize border_size() const override
The size of the border for that kernel.
1 channel, 1 S32 per channel
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: Helpers.h:437
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
Definition: Utils.h:1198
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
Definition: Utils.h:67
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Definition: Validate.h:288
quantized, asymmetric fixed-point 8-bit number unsigned
T z() const
Alias to access the size of the third dimension.
Definition: Dimensions.h:91
UniformQuantizationInfo uniform() const
Return per layer quantization info.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
const std::vector< float > & scale() const
Scale vector accessor.
Padding and stride information class.
Definition: Types.h:689
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
Definition: Utils.h:1143
quantized, symmetric per channel fixed-point 8-bit number
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Definition: Validate.h:163
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
Information about executing thread and CPU.
Definition: CPPTypes.h:235
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
Definition: Error.h:159
T fma(T x, T y, T z)
Computes (x*y) + z as if to infinite precision and rounded only once to fit the result type.
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Definition: Window.h:47
Class for specifying the size of an image or rectangle.
Definition: Size2D.h:34
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
uint8x8_t vload(const uint8_t *ptr)
Definition: load.h:39
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda function for each element.
Definition: Helpers.inl:128
T y() const
Alias to access the size of the second dimension.
Definition: Dimensions.h:86
quantized, asymmetric fixed-point 8-bit number signed
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Definition: mla.h:46
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205
void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U))
Initialize the function's source, destination and parameters.
unsigned int pad_left() const
Get the left padding.
Definition: Types.h:763
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U))
Static function to check if given info will lead to a valid configuration of NEDepthwiseConvolutionLa...