Compute Library
 20.02.1
NEDepthwiseConvolutionLayerNativeKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2020 ARM Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
32 #include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp"
33 
34 namespace arm_compute
35 {
36 namespace
37 {
38 void pad_vectors(std::vector<int> &mult, std::vector<int> &shift, int vec_size)
39 {
40  ARM_COMPUTE_ERROR_ON(mult.size() != shift.size());
41  while(mult.size() % vec_size != 0)
42  {
43  mult.push_back(0);
44  shift.push_back(0);
45  }
46 }
47 
// Depthwise convolution inner loop for floating-point types when depth_multiplier == 1.
// Each window iteration computes S channels at once using a NEON vector of element type T
// with S lanes. Channels appear to live in dimension 0 (NHWC-style addressing: the window's
// Y/Z map to the spatial dimensions) — confirm against the kernel's layout contract.
template <typename T, int S, bool has_biases>
void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                   const Size2D &dilation, const Window &window)
{
    using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
    using TagType    = typename wrapper::traits::neon_vector<T, S>::tag_type;

    const size_t input_stride_y   = input->info()->strides_in_bytes().y();
    const size_t input_stride_z   = input->info()->strides_in_bytes().z();
    // Largest byte offset that is still backed by allocated input memory; reads beyond it are
    // clamped below so filter taps that fall into the border read valid memory.
    const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
                                    input->info()->strides_in_bytes().y();
    const size_t weights_width    = weights->info()->dimension(1);
    const size_t weights_height   = weights->info()->dimension(2);
    const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
    const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
    const size_t conv_stride_x    = conv_info.stride().first;
    const size_t conv_stride_y    = conv_info.stride().second;
    const size_t conv_pad_left    = conv_info.pad_left();
    const size_t conv_pad_top     = conv_info.pad_top();

    // The input is addressed manually with byte offsets, so collapse its spatial window dims.
    Window win_input = window;
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    // Weights (and biases) only advance along the channel dimension; dim 3 is collapsed too.
    Window win_weights = win_input;
    win_weights.set(3, Window::Dimension(0, 0, 0));

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, window);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(window, [&](const Coordinates & id)
    {
        VectorType acc = wrapper::vdup_n(static_cast<T>(0), TagType{});

        // Top-left corner of this output element's receptive field, as a byte offset.
        const int input_y      = id.y() * conv_stride_x - conv_pad_left;
        const int input_z      = id.z() * conv_stride_y - conv_pad_top;
        int       input_offset = input_y * input_stride_y + input_z * input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < weights_width; ++w)
            {
                // Clamp so border taps read in-bounds (padded) memory rather than out of range.
                const auto input_vals   = wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
                const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * weights_stride_y));

                acc = wrapper::vmla(acc, weights_vals, input_vals);
                offs += dilation.x() * input_stride_y;
            }

            weights_ptr += weights_stride_z;
            input_offset += dilation.y() * input_stride_z;
        }

        if(has_biases)
        {
            const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()));
            acc = wrapper::vadd(acc, biases_vals);
        }

        wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()), acc);
    },
    input_it, weights_it, biases_it, output_it);
}
120 
// Generic (scalar) depthwise convolution loop for floating-point types, handling any
// depth_multiplier. For each input channel, depth_multiplier consecutive output channels are
// accumulated; the input window's X step is forced to 1 so each input channel is visited once.
template <typename T, bool has_biases>
void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                               const Size2D &dilation, unsigned int depth_multiplier, const Window &window)
{
    const size_t input_stride_y   = input->info()->strides_in_bytes().y();
    const size_t input_stride_z   = input->info()->strides_in_bytes().z();
    // Largest byte offset still backed by allocated input memory; used to clamp border reads.
    const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
                                    input->info()->strides_in_bytes().y();
    const size_t weights_width    = weights->info()->dimension(1);
    const size_t weights_height   = weights->info()->dimension(2);
    const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
    const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
    const size_t conv_stride_x    = conv_info.stride().first;
    const size_t conv_stride_y    = conv_info.stride().second;
    const size_t conv_pad_left    = conv_info.pad_left();
    const size_t conv_pad_top     = conv_info.pad_top();

    // Input is addressed manually via byte offsets; collapse its spatial window dimensions.
    Window win_input = window;
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    Window win_weights = win_input;
    win_weights.set(3, Window::Dimension(0, 0, 0));

    // One input channel per iteration (the output window steps depth_multiplier channels).
    win_input.set_dimension_step(Window::DimX, 1);

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, window);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(window, [&](const Coordinates & id)
    {
        // One scalar accumulator per output channel derived from this input channel.
        std::vector<T> acc(depth_multiplier, static_cast<T>(0));

        // Top-left corner of the receptive field, as a byte offset into the input.
        const int input_y      = id.y() * conv_stride_x - conv_pad_left;
        const int input_z      = id.z() * conv_stride_y - conv_pad_top;
        int       input_offset = input_y * input_stride_y + input_z * input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < weights_width; ++w)
            {
                // Clamp so taps in the border read valid (padded) memory.
                const auto input_val = *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));

                for(size_t m = 0; m < depth_multiplier; ++m)
                {
                    const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
                    // Fused multiply-add keeps a single rounding per accumulation step.
                    acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
                }

                offs += dilation.x() * input_stride_y;
            }

            weights_ptr += weights_stride_z;
            input_offset += dilation.y() * input_stride_z;
        }

        if(has_biases)
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
            }
        }
        else
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
            }
        }
    },
    input_it, weights_it, biases_it, output_it);
}
204 
205 template <typename T, typename TW, int S, bool has_biases, bool is_per_channel>
206 void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
207  const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window)
208 {
209  using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
210  using TagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
211 
212  const size_t input_stride_y = input->info()->strides_in_bytes().y();
213  const size_t input_stride_z = input->info()->strides_in_bytes().z();
214  const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
215  input->info()->strides_in_bytes().y();
216  const size_t weights_width = weights->info()->dimension(1);
217  const size_t weights_height = weights->info()->dimension(2);
218  const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
219  const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
220  const size_t conv_stride_x = conv_info.stride().first;
221  const size_t conv_stride_y = conv_info.stride().second;
222  const size_t conv_pad_left = conv_info.pad_left();
223  const size_t conv_pad_top = conv_info.pad_top();
224 
225  const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
226  const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
227  const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
228  const int32_t k_offset = weights_width * weights_height * input_qoffset * weights_qoffset;
229 
230  Window win_input = window;
231  win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
232  win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
233 
234  Window win_weights = win_input;
235  win_weights.set(3, Window::Dimension(0, 0, 0));
236 
237  Iterator input_it(input, win_input);
238  Iterator weights_it(weights, win_weights);
239  Iterator output_it(output, window);
240  Iterator biases_it{};
241 
242  if(has_biases)
243  {
244  biases_it = Iterator(biases, win_weights);
245  }
246 
247  execute_window_loop(window, [&](const Coordinates & id)
248  {
249  std::vector<int32_t> acc(S, 0);
250  std::vector<int32_t> in_sum(S, 0);
251  std::vector<int32_t> we_sum(S, 0);
252 
253  const int input_y = id.y() * conv_stride_x - conv_pad_left;
254  const int input_z = id.z() * conv_stride_y - conv_pad_top;
255  int input_offset = input_y * input_stride_y + input_z * input_stride_z;
256 
257  auto weights_ptr = weights_it.ptr();
258  for(size_t h = 0; h < weights_height; ++h)
259  {
260  int offs = input_offset;
261  for(size_t w = 0; w < weights_width; ++w)
262  {
263  const auto input_vals = wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
264  const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * weights_stride_y));
265 
266  for(int i = 0; i < S; ++i)
267  {
268  acc.at(i) += input_vals[i] * weights_vals[i];
269  in_sum.at(i) += input_vals[i];
270  we_sum.at(i) += weights_vals[i];
271  }
272 
273  offs += dilation.x() * input_stride_y;
274  }
275 
276  weights_ptr += weights_stride_z;
277  input_offset += dilation.y() * input_stride_z;
278  }
279 
280  VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
281  for(int i = 0; i < S; ++i)
282  {
283  acc.at(i) -= in_sum.at(i) * weights_qoffset;
284  acc.at(i) -= we_sum.at(i) * input_qoffset;
285  acc.at(i) += k_offset;
286 
287  if(has_biases)
288  {
289  acc.at(i) += *reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t));
290  }
291 
292  const int out_mul = output_multiplier.at(id.x() + i);
293  const int out_shift = output_shift.at(id.x() + i);
294  if(out_shift < 0)
295  {
296  acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
297  }
298  else
299  {
300  acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
301  }
302  out_vals[i] = static_cast<T>(utility::clamp<int32_t, T>(acc.at(i)));
303  }
304 
305  wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()), out_vals);
306  },
307  input_it, weights_it, biases_it, output_it);
308 }
309 
310 template <typename T, typename TW, bool has_biases, bool is_per_channel>
311 void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
312  const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window)
313 {
314  const size_t input_stride_y = input->info()->strides_in_bytes().y();
315  const size_t input_stride_z = input->info()->strides_in_bytes().z();
316  const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
317  input->info()->strides_in_bytes().y();
318  const size_t weights_width = weights->info()->dimension(1);
319  const size_t weights_height = weights->info()->dimension(2);
320  const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
321  const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
322  const size_t conv_stride_x = conv_info.stride().first;
323  const size_t conv_stride_y = conv_info.stride().second;
324  const size_t conv_pad_left = conv_info.pad_left();
325  const size_t conv_pad_top = conv_info.pad_top();
326 
327  const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
328  const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
329  const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
330  const int32_t k_offset = weights_width * weights_height * input_qoffset * weights_qoffset;
331 
332  Window win_input = window;
333  win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
334  win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));
335 
336  Window win_weights = win_input;
337  win_weights.set(3, Window::Dimension(0, 0, 0));
338 
339  win_input.set_dimension_step(Window::DimX, 1);
340 
341  Iterator input_it(input, win_input);
342  Iterator weights_it(weights, win_weights);
343  Iterator output_it(output, window);
344  Iterator biases_it{};
345 
346  if(has_biases)
347  {
348  biases_it = Iterator(biases, win_weights);
349  }
350 
351  execute_window_loop(window, [&](const Coordinates & id)
352  {
353  std::vector<int32_t> acc(depth_multiplier, 0);
354  std::vector<int32_t> we_sum(depth_multiplier, 0);
355  int32_t in_sum = 0;
356 
357  const int input_y = id.y() * conv_stride_x - conv_pad_left;
358  const int input_z = id.z() * conv_stride_y - conv_pad_top;
359  int input_offset = input_y * input_stride_y + input_z * input_stride_z;
360 
361  auto weights_ptr = weights_it.ptr();
362  for(size_t h = 0; h < weights_height; ++h)
363  {
364  int offs = input_offset;
365  for(size_t w = 0; w < weights_width; ++w)
366  {
367  const auto input_val = *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
368 
369  for(size_t m = 0; m < depth_multiplier; ++m)
370  {
371  const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
372  acc.at(m) += input_val * weights_val;
373 
374  we_sum.at(m) += weights_val;
375  }
376 
377  offs += dilation.x() * input_stride_y;
378  in_sum += input_val;
379  }
380 
381  weights_ptr += weights_stride_z;
382  input_offset += dilation.y() * input_stride_z;
383  }
384 
385  for(size_t m = 0; m < depth_multiplier; ++m)
386  {
387  acc.at(m) -= in_sum * weights_qoffset;
388  acc.at(m) -= we_sum.at(m) * input_qoffset;
389  acc.at(m) += k_offset;
390 
391  if(has_biases)
392  {
393  acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
394  }
395 
396  const int out_mul = output_multiplier.at(id.x() + m);
397  const int out_shift = output_shift.at(id.x() + m);
398  if(out_shift < 0)
399  {
400  acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
401  }
402  else
403  {
404  acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
405  }
406  *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<int32_t, T>(acc.at(m)));
407  }
408  },
409  input_it, weights_it, biases_it, output_it);
410 }
411 
// Validates tensor metadata and convolution parameters for this kernel; returns an error
// Status on the first failed check.
// NOTE(review): several lines were dropped by the documentation extraction (null-pointer and
// data-type checks, and the conditionals guarding the dangling brace pairs below, which by
// their contents test for per-channel quantization and for quantized/float bias types).
// Verify against the original source before editing.
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                          const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier == 0);
    // The dilated filter must fit inside the padded input in both spatial dimensions.
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (dilation.x() - 1) > input->dimension(1) + conv_info.pad_left() + conv_info.pad_right());
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (dilation.y() - 1) > input->dimension(2) + conv_info.pad_top() + conv_info.pad_bottom());
    // Output channels = input channels * depth_multiplier.
    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * depth_multiplier) != weights->dimension(0));
    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
    ARM_COMPUTE_RETURN_ERROR_ON((conv_info.stride().first < 1) || (conv_info.stride().second < 1));

    // NOTE(review): lost conditional — presumably if(is_data_type_quantized_per_channel(...)).
    {
        // Per-channel quantization needs one scale per output channel.
        ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
    }
    else
    {
        // NOTE(review): check lost in extraction (input/weights data-type match).
    }

    if(biases != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));

        if(is_data_type_quantized_asymmetric(input->data_type()))
        {
            // NOTE(review): check lost in extraction (biases must be S32 for quantized input).
        }
        else
        {
            // NOTE(review): check lost in extraction (biases must match input data type).
        }
    }

    if(output->total_size() != 0)
    {
        // NOTE(review): output shape/type checks lost in extraction.
    }

    return Status{};
}
460 
// Computes and validates the kernel execution window, auto-initializing the output tensor if
// needed, and requests the padding each tensor requires. Returns {error status, window}.
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *biases,
                                                        ITensorInfo *output, const PadStrideInfo &conv_info,
                                                        unsigned int depth_multiplier, const Size2D &dilation)
{
    // Get convolved dimensions
    // NOTE(review): the line defining 'output_shape' (presumably via
    // misc::shape_calculator::compute_depthwise_convolution_shape) was lost in extraction.

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));

    // Configure kernel window (generic)
    // One 8-byte vector's worth of channels per iteration when depth_multiplier == 1,
    // otherwise one element (scalar path).
    const unsigned int num_elems_read_per_iteration    = (depth_multiplier == 1) ? 8 / element_size_from_data_type(input->data_type()) : 1;
    const unsigned int num_elems_written_per_iteration = num_elems_read_per_iteration * depth_multiplier;

    // Configure kernel window
    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));

    AccessWindowStatic input_access(input, 0, -conv_info.pad_left(), ceil_to_multiple(num_elems_read_per_iteration, input->dimension(0)),
                                    input->dimension(1) + std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()));
    AccessWindowHorizontal weights_access(weights, 0, num_elems_written_per_iteration);
    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);

    bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);

    if(biases != nullptr)
    {
        AccessWindowHorizontal biases_access(biases, 0, num_elems_written_per_iteration);
        window_changed |= update_window_and_padding(win, biases_access);
    }

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

    // Padding could not be grown to what the access patterns need: report a runtime error.
    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
496 } // namespace
497 
// Default constructor: value-initializes all members, with a zero border and a depth
// multiplier of 1.
// NOTE(review): the constructor's signature line was lost in the documentation extraction;
// only the member initializer list and body remain below.
 : _func(), _border_size(0), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift()
{
}
502 
504 {
505  return _border_size;
506 }
507 
// Configures the kernel: validates arguments, caches tensor pointers and parameters,
// precomputes per-channel requantization multipliers/shifts for quantized types, selects the
// type-specialised run function, and configures the execution window.
// NOTE(review): the first signature line and a null-pointer check were lost in the
// documentation extraction; only the trailing parameters of
// NEDepthwiseConvolutionLayerNativeKernel::configure(...) remain below.
                                                     const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
{
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation));

    _input            = input;
    _weights          = weights;
    _biases           = biases;
    _output           = output;
    _conv_info        = conv_info;
    _depth_multiplier = depth_multiplier;
    // Left border plus the maximum of the other three paddings on the right edge; the loops
    // themselves clamp vertical over-reads via input_max_offset.
    _border_size      = BorderSize(_conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
    _dilation         = dilation;

    if(is_data_type_quantized(_input->info()->data_type()))
    {
        const auto input_scale  = input->info()->quantization_info().uniform().scale;
        const auto output_scale = output->info()->quantization_info().uniform().scale;

        auto weights_scale = weights->info()->quantization_info().scale();
        // NOTE(review): the conditional guarding this brace was lost in extraction —
        // presumably "not per-channel quantized", broadcasting the single uniform scale to
        // every output channel.
        {
            for(size_t i = 1; i < _weights->info()->dimension(0); ++i)
            {
                weights_scale.push_back(weights_scale.front());
            }
        }

        // Precompute one fixed-point (multiplier, shift) pair per output channel.
        for(size_t i = 0; i < weights_scale.size(); ++i)
        {
            int32_t     out_mult   = 0;
            int32_t     out_shift  = 0;
            const float multiplier = input_scale * weights_scale.at(i) / output_scale;
            arm_compute::quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);

            _output_multiplier.push_back(out_mult);
            _output_shift.push_back(out_shift);
        }
    }

    // Select the type-specialised run function and pad the requant vectors to the vector
    // length used by the chosen specialisation.
    // NOTE(review): two case labels were lost in extraction before the int8_t branches below —
    // presumably DataType::QASYMM8_SIGNED and DataType::QSYMM8_PER_CHANNEL.
    switch(_weights->info()->data_type())
    {
        case DataType::QASYMM8:
            _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t, 8, true, false> :
                    &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t, 8, false, false>;
            pad_vectors(_output_multiplier, _output_shift, 8);
            break;
            _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8, true, false> :
                    &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8, false, false>;
            pad_vectors(_output_multiplier, _output_shift, 8);
            break;
            if(_input->info()->data_type() == DataType::QASYMM8)
            {
                _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t, 8, true, true> :
                        &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t, 8, false, true>;
            }
            else
            {
                _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8, true, true> :
                        &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t, 8, false, true>;
            }
            pad_vectors(_output_multiplier, _output_shift, 8);
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t, 4, true, false> :
                    &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t, 4, false, false>;
            pad_vectors(_output_multiplier, _output_shift, 4);
            break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F32:
            _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float, 2, true, false> :
                    &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float, 2, false, false>;
            pad_vectors(_output_multiplier, _output_shift, 2);
            break;
        default:
            ARM_COMPUTE_ERROR("Data type not supported");
            break;
    }

    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), (biases != nullptr) ? biases->info() : nullptr, _output->info(), _conv_info, _depth_multiplier, dilation);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    INEKernel::configure(win_config.second);
}
595 
// Static validation entry point: checks the argument metadata, then dry-runs window
// configuration on clones so no caller state is modified.
// NOTE(review): the first signature line was lost in the documentation extraction; per the
// class declaration this is the static
// NEDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, conv_info,
// ...) overload, of which only the trailing parameters remain below.
                                                        unsigned int depth_multiplier,
                                                        const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), (biases != nullptr) ? biases->clone().get() : nullptr, output->clone().get(), conv_info,
                                                              depth_multiplier, dilation)
                                .first);
    return Status{};
}
606 
608 {
612 
613  (this->*_func)(window);
614 }
615 
// Floating-point dispatch overload: SFINAE-enabled only for T = float (and float16_t when
// FP16 vector arithmetic is available).
template < typename T, typename TW, int S, bool has_biases, bool is_per_channel, typename std::enable_if < std::is_same<T, float>::value
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                                                                                 || std::is_same<T, float16_t>::value
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                                                                                 ,
                                                                                 int >::type >
void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window)
{
    // NOTE(review): the usual window/unconfigured-kernel validation macros were lost here in
    // the documentation extraction.

    if(_depth_multiplier == 1)
    {
        // Fast path: vectorized over S channels at a time.
        depthwise_loop_multiplier1_fp<T, S, has_biases>(_input, _weights, _biases, _output, _conv_info, _dilation, window);
    }
    else
    {
        // Generic scalar path handling any depth multiplier.
        depthwise_loop_generic_fp<T, has_biases>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window);
    }
}
636 
// Quantized dispatch overload: selected for the integer T instantiations registered in
// configure(); forwards the precomputed per-channel requantization parameters.
template <typename T, typename TW, int S, bool has_biases, bool is_per_channel, typename>
void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window)
{
    // NOTE(review): the usual window/unconfigured-kernel validation macros were lost here in
    // the documentation extraction.

    if(_depth_multiplier == 1)
    {
        // Fast path: vectorized over S channels at a time.
        depthwise_loop_multiplier1_quantized<T, TW, S, has_biases, is_per_channel>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift, window);
    }
    else
    {
        // Generic scalar path handling any depth multiplier.
        depthwise_loop_generic_quantized<T, TW, has_biases, is_per_channel>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window);
    }
}
652 } // namespace arm_compute
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
Definition: Utils.h:1117
SimpleTensor< float > w
Definition: DFT.cpp:156
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
TensorInfo * info() const override
Interface to be implemented by the child class to return the tensor's metadata.
Definition: CLTensor.cpp:41
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info, unsigned int depth_multiplier, const Size2D &dilation=Size2D(1U, 1U))
Calculate the depthwise convolution output shape of a tensor.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
Container for 2D border size.
Definition: Types.h:269
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:545
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
size_t dimension(size_t index) const override
Return the size of the requested dimension.
Definition: TensorInfo.h:232
size_t element_size_from_data_type(DataType dt)
The size in bytes of the data type.
Definition: Utils.h:186
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
QuantizationInfo quantization_info() const override
Get the quantization settings (scale and offset) of the tensor.
Definition: TensorInfo.h:311
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:792
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Store the tensor's metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
const Strides & strides_in_bytes() const override
The strides in bytes for accessing each dimension of the tensor.
Definition: TensorInfo.h:240
Status class.
Definition: Error.h:52
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Interface for NEON tensor.
Definition: ITensor.h:36
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps=Steps(), bool skip_border=false, BorderSize border_size=BorderSize())
Calculate the maximum window for a given tensor shape and border setting.
Definition: Helpers.cpp:28
Copyright (c) 2017-2020 ARM Limited.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
Definition: Helpers.inl:202
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:71
1 channel, 1 F16 per channel
BorderSize border_size() const override
The size of the border for that kernel.
1 channel, 1 S32 per channel
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: Helpers.h:402
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
Definition: Utils.h:1194
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
Definition: Utils.h:66
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Definition: Validate.h:288
quantized, asymmetric fixed-point 8-bit number unsigned
T z() const
Alias to access the size of the third dimension.
Definition: Dimensions.h:91
UniformQuantizationInfo uniform() const
Return per layer quantization info.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
const std::vector< float > & scale() const
Scale vector accessor.
Padding and stride information class.
Definition: Types.h:686
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
Definition: Utils.h:1139
quantized, symmetric per channel fixed-point 8-bit number
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Definition: Validate.h:163
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
Information about executing thread and CPU.
Definition: CPPTypes.h:225
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
Definition: Error.h:159
T fma(T x, T y, T z)
Computes (x*y) + z as if to infinite precision and rounded only once to fit the result type.
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Definition: Window.h:47
Class for specifying the size of an image or rectangle.
Definition: Size2D.h:34
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift)
Calculate quantized representation of multiplier.
uint8x8_t vload(const uint8_t *ptr)
Definition: load.h:39
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:123
T y() const
Alias to access the size of the second dimension.
Definition: Dimensions.h:86
quantized, asymmetric fixed-point 8-bit number signed
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Definition: mla.h:46
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205
void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U))
Initialize the function's source, destination and parameters.
unsigned int pad_left() const
Get the left padding.
Definition: Types.h:760
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U))
Static function to check if given info will lead to a valid configuration of NEDepthwiseConvolutionLa...