Compute Library
 20.08
NEDirectConvolutionLayerOutputStageKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
28 #include "arm_compute/core/Error.h"
34 #include "arm_compute/core/Types.h"
38 
39 #include <arm_neon.h>
40 #include <cstddef>
41 #include <cstdint>
42 
43 namespace arm_compute
44 {
45 namespace
46 {
// Validate the tensor-info arguments of the output-stage kernel.
// Returns an error Status on the first failed check, an empty Status otherwise.
// NOTE(review): several validation macro lines appear to have been elided from
// this view (including the checks inside the float/non-float branches below);
// only the visible checks are documented here — confirm against the full file.
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
                          const DirectConvolutionLayerOutputStageKernelInfo &info)
{

    // A bias, when provided, must be a vector (at most one dimension).
    if(bias != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
    }

    // S32 input comes from a quantized accumulator; the requantized result has a
    // different data type, so it cannot be written in place over the input.
    if(input->data_type() == DataType::S32)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output == nullptr, "In-place computation not allowed for quantized output");
    }

    // Checks performed when output is configured
    if((output != nullptr) && (output->total_size() != 0))
    {
        // NOTE(review): the bodies of both branches are empty in this view —
        // the per-branch data-type checks look elided; verify in the full file.
        if(is_data_type_float(input->data_type()))
        {
        }
        else
        {
        }
    }
    else if(input->data_type() == DataType::S32)
    {
        // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo
        ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED));
    }

    return Status{};
}
88 
// Compute the kernel's execution window, auto-initialize the output tensor info
// when needed, and update tensor padding. Returns {Status, Window}; the Status
// is an error when the required padding changed (caller must re-allocate).
// NOTE(review): the declarations of `win` (the maximum window) and
// `num_elems_processed_per_iteration` are elided from this view — presumably
// initialized just before their first use; confirm against the full file.
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output,
                                                        const DirectConvolutionLayerOutputStageKernelInfo &info)
{

    const DataType data_type = input->data_type();

    // Auto-initialize output if required
    if(output != nullptr)
    {
        // Work out expected output data type: S32 input is requantized to the
        // type requested via the kernel info, anything else is passed through.
        const DataType output_dt = (data_type == DataType::S32) ? info.output_data_type : data_type;
        // Output tensor auto initialization if not yet initialized
        auto_init_if_empty(*output, input->clone()->set_data_type(output_dt));
    }

    bool window_changed = false;

    // Update processed elements when input is S32 (comes from quantization input)
    if(data_type == DataType::S32)
    {
    }

    // Configure kernel window
    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);

    if(output != nullptr && (output->total_size() != 0))
    {
        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);

        // With a configured output, include the (optional) bias in the padding update.
        if(bias == nullptr)
        {
            window_changed = update_window_and_padding(win, input_access, output_access);
        }
        else
        {
            AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
            window_changed = update_window_and_padding(win, input_access, output_access, bias_access);
        }

        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
    }
    else
    {
        // In-place path (no configured output): the valid region is set on the input.
        if(bias == nullptr)
        {
            window_changed = update_window_and_padding(win, input_access);
        }
        else
        {
            if(input->data_layout() == DataLayout::NCHW)
            {
                AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
                window_changed = update_window_and_padding(win, input_access, bias_access);
            }
            else
            {
                // NHWC bias is read contiguously along channels: a horizontal access suffices.
                AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration);
                window_changed = update_window_and_padding(win, input_access, bias_access);
            }
        }

        input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
    }

    // Any padding growth means the tensors must be re-allocated: surface it as an error.
    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
160 
161 template <typename T, bool has_bias>
162 typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
163 output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
164  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
165 {
166  /** NEON vector tag type. */
167  using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
168 
169  ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
170  ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
171  ARM_COMPUTE_UNUSED(result_shift);
172  ARM_COMPUTE_UNUSED(result_offset_after_shift);
173 
174  Iterator in(input, window);
175  Iterator out(output, window);
176  execute_window_loop(window, [&](const Coordinates & id)
177  {
178  // Get bias and pointer to input
179  const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
180  auto v_in = wrapper::vloadq(in_ptr);
181 
182  // Accumulate bias
183  if(has_bias)
184  {
185  const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
186  v_in = wrapper::vadd(v_in, vb);
187  }
188 
189  const auto out_ptr = reinterpret_cast<T *>(out.ptr());
190  wrapper::vstore(out_ptr, v_in);
191  },
192  in, out);
193 }
194 
195 template <typename T, bool has_bias>
196 typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
197 output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
198  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
199 {
200  ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
201  ARM_COMPUTE_UNUSED(result_shift);
202  ARM_COMPUTE_UNUSED(result_offset_after_shift);
203 
204  Window window_bias = window;
205  window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
206  window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
207  window_bias.set(3, Window::Dimension(0, 0, 0));
208 
209  Iterator in(input, window);
210  Iterator bi(bias, window_bias);
211  Iterator out(output, window);
212  execute_window_loop(window, [&](const Coordinates &)
213  {
214  // Get bias and pointer to input
215  const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
216  auto v_in = wrapper::vloadq(in_ptr);
217 
218  // Accumulate bias
219  if(has_bias)
220  {
221  const auto bias_ptr = reinterpret_cast<T *>(bi.ptr());
222  v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
223  }
224 
225  const auto out_ptr = reinterpret_cast<T *>(out.ptr());
226  wrapper::vstore(out_ptr, v_in);
227 
228  },
229  in, bi, out);
230 }
231 
232 // Quantized case
233 template < typename TOut, bool has_bias, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
234 void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
235  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
236 {
237  using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
238  using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
239 
240  const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
241 
242  const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
243  const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
244 
245  Iterator in(input, window);
246  Iterator out(output, window);
247 
248  execute_window_loop(window, [&](const Coordinates & id)
249  {
250  // Get bias and pointer to input
251  const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
252  int32x4x4_t v_in =
253  {
254  {
255  wrapper::vloadq(in_ptr),
256  wrapper::vloadq(in_ptr + 4),
257  wrapper::vloadq(in_ptr + 8),
258  wrapper::vloadq(in_ptr + 12)
259  }
260  };
261 
262  // Accumulate bias
263  if(has_bias)
264  {
265  const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
266  v_in =
267  {
268  {
269  wrapper::vadd(v_in.val[0], vb),
270  wrapper::vadd(v_in.val[1], vb),
271  wrapper::vadd(v_in.val[2], vb),
272  wrapper::vadd(v_in.val[3], vb)
273  }
274  };
275  }
276 
277  const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
278  wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
279  },
280  in, out);
281 }
282 template < typename TOut, bool has_bias, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
283 void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
284  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
285 {
286  using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
287  using TagType = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;
288 
289  const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
290 
291  const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
292  const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
293 
294  Window window_bias = window;
295  window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
296  window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
297  window_bias.set(3, Window::Dimension(0, 0, 0));
298 
299  Iterator in(input, window);
300  Iterator bi(bias, window_bias);
301 
302  Iterator out(output, window);
303  execute_window_loop(window, [&](const Coordinates &)
304  {
305  // Get bias and pointer to input
306  const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
307  int32x4x4_t v_in =
308  {
309  {
310  wrapper::vloadq(in_ptr),
311  wrapper::vloadq(in_ptr + 4),
312  wrapper::vloadq(in_ptr + 8),
313  wrapper::vloadq(in_ptr + 12),
314  }
315  };
316 
317  // Accumulate bias
318  if(has_bias)
319  {
320  const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr());
321 
322  wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
323  wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
324  wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
325  wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
326  }
327 
328  const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
329  wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
330  },
331  in, bi, out);
332 }
333 } // namespace
334 
// Default constructor: no kernel function selected yet, all tensor pointers null,
// and the fixed-point requantization parameters zeroed. configure() fills these in.
// NOTE(review): the constructor's declarator line is elided from this view.
 : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0)
{
}
339 
// NOTE(review): the signature line (NEDirectConvolutionLayerOutputStageKernel::configure)
// is elided from this view. The body validates the arguments, stores the kernel
// state, configures the execution window, and selects the templated stage
// function based on data layout, data type and bias presence.
{
    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info(), info));

    _func = nullptr;
    _bias = bias;
    _input = input;
    // In-place computation: with no output tensor, results are written back into the input.
    _output = (output != nullptr) ? output : input;
    _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier;
    _result_shift = info.result_shift;
    _result_offset_after_shift = info.result_offset_after_shift;

    // Configure kernel window
    auto win_config = validate_and_configure_window(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info(), info);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    INEKernel::configure(win_config.second);

    const bool has_bias = bias != nullptr;
    const bool is_qasymm8_signed = (output != nullptr) ? is_data_type_quantized_asymmetric_signed(output->info()->data_type()) : false;

    // Set appropriate function: layout (NCHW vs NHWC) picks the template family,
    // input data type and signedness pick the instantiation.
    if(input->info()->data_layout() == DataLayout::NCHW)
    {
        switch(input->info()->data_type())
        {
            // S32 input means quantized accumulators: requantize to QASYMM8(_SIGNED).
            case DataType::S32:
            {
                if(is_qasymm8_signed)
                {
                    _func = (has_bias) ? &output_stage_nchw<int8_t, true> : &output_stage_nchw<int8_t, false>;
                }
                else
                {
                    _func = (has_bias) ? &output_stage_nchw<uint8_t, true> : &output_stage_nchw<uint8_t, false>;
                }
                break;
            }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            case DataType::F16:
            {
                _func = (has_bias) ? &output_stage_nchw<float16_t, true> : &output_stage_nchw<float16_t, false>;
                break;
            }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            case DataType::F32:
            {
                _func = (has_bias) ? &output_stage_nchw<float, true> : &output_stage_nchw<float, false>;
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
            }
        }
    }
    else
    {
        switch(input->info()->data_type())
        {
            // S32 input means quantized accumulators: requantize to QASYMM8(_SIGNED).
            case DataType::S32:
            {
                if(is_qasymm8_signed)
                {
                    _func = (has_bias) ? &output_stage_nhwc<int8_t, true> : &output_stage_nhwc<int8_t, false>;
                }
                else
                {
                    _func = (has_bias) ? &output_stage_nhwc<uint8_t, true> : &output_stage_nhwc<uint8_t, false>;
                }
                break;
            }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            case DataType::F16:
            {
                _func = (has_bias) ? &output_stage_nhwc<float16_t, true> : &output_stage_nhwc<float16_t, false>;
                break;
            }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
            case DataType::F32:
            {
                _func = (has_bias) ? &output_stage_nhwc<float, true> : &output_stage_nhwc<float, false>;
                break;
            }
            default:
            {
                ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
            }
        }
    }
}
433 
// NOTE(review): the static validate() signature is elided from this view, as is
// (apparently) the call to validate_arguments() that usually precedes the window
// validation — confirm against the full file.
{
    // Window validation runs on clones so the caller's tensor infos are not mutated.
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
                                                              bias == nullptr ? nullptr : bias->clone().get(),
                                                              output == nullptr ? nullptr : output->clone().get(),
                                                              info)
                                .first);

    return Status{};
}
446 
// NOTE(review): the run() signature and the standard kernel/window validation
// macros appear to be elided from this view.
{
    ARM_COMPUTE_ERROR_ON(_func == nullptr);

    // Dispatch to the stage function selected during configure().
    (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
}
456 } // namespace arm_compute
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
Definition: ITensor.h:63
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:545
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
size_t element_size_from_data_type(DataType dt)
The size in bytes of the data type.
Definition: Utils.h:181
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:792
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Store the tensor's metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
Status class.
Definition: Error.h:52
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
Interface for NEON tensor.
Definition: ITensor.h:36
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps=Steps(), bool skip_border=false, BorderSize border_size=BorderSize())
Calculate the maximum window for a given tensor shape and border setting.
Definition: Helpers.cpp:28
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Copyright (c) 2017-2020 Arm Limited.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
Definition: Helpers.inl:207
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:108
1 channel, 1 F16 per channel
ITensorInfo * info() const override
Interface to be implemented by the child class to return the tensor's metadata.
Definition: Tensor.cpp:33
1 channel, 1 S32 per channel
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: Helpers.h:437
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
void configure(ITensor *input, const ITensor *bias=nullptr, ITensor *output=nullptr, const DirectConvolutionLayerOutputStageKernelInfo &info=DirectConvolutionLayerOutputStageKernelInfo())
Set the accumulate buffer and the biases of the kernel.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
Definition: Validate.h:443
quantized, asymmetric fixed-point 8-bit number unsigned
bool is_data_type_quantized_asymmetric_signed(DataType dt)
Check if a given data type is of asymmetric quantized signed type.
Definition: Utils.h:1162
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
Num samples, channels, height, width.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Definition: Validate.h:163
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
static Status validate(const ITensorInfo *input, const ITensorInfo *bias=nullptr, const ITensorInfo *output=nullptr, const DirectConvolutionLayerOutputStageKernelInfo &info=DirectConvolutionLayerOutputStageKernelInfo())
Static function to check if given info will lead to a valid configuration of NEDirectConvolutionLayer...
uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
Performs final quantization step on 16 elements.
Definition: NEAsymm.h:80
Information about executing thread and CPU.
Definition: CPPTypes.h:235
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
Definition: Error.h:159
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Definition: Window.h:47
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
unsigned int num_elems_processed_per_iteration
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:244
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:128
quantized, asymmetric fixed-point 8-bit number signed
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
Definition: Helpers.inl:332
DataType
Available data types.
Definition: Types.h:77
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
bool is_data_type_float(DataType dt)
Check if a given data type is of floating point type.
Definition: Utils.h:1101
Descriptor used by the direct convolution layer output stage kernels.