Compute Library
 20.08
NEDepthwiseConvolutionLayer3x3Kernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
26 
30 #include "arm_compute/core/Error.h"
36 #include "arm_compute/core/Types.h"
37 #include "arm_compute/core/Utils.h"
41 
42 namespace arm_compute
43 {
44 namespace
45 {
46 template <typename T1, typename T2, unsigned int stridex>
47 class convolver_3x3
48 {
49 public:
50  static void convolve(const Window &window, unsigned int num_elems_written_per_iteration,
51  const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
52  {
53  const int input_offset = -input->info()->quantization_info().uniform().offset;
54  const int weights_offset = -weights->info()->quantization_info().uniform().offset;
55 
56  const int input_stride_x = input->info()->strides_in_bytes().x();
57  const int input_stride_y = input->info()->strides_in_bytes().y();
58  const int input_stride_z = input->info()->strides_in_bytes().z();
59  const int input_stride_w = input->info()->strides_in_bytes()[3];
60  const int output_stride_y = output->info()->strides_in_bytes().y();
61  const int kernel_stride_y = weights->info()->strides_in_bytes().y();
62  const int kernel_stride_z = weights->info()->strides_in_bytes().z();
63  const int output_w = output->info()->dimension(0);
64  const int output_h = output->info()->dimension(1);
65  const int delta_input = detail::get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
66  const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
67  const unsigned int conv_pad_x = conv_info.pad_left();
68  const unsigned int conv_pad_y = conv_info.pad_top();
69 
70  // setup output window for the iterator
71  Window window_out = window;
72  window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
73  window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
74 
75  // setup input window for the iterator
76  Window window_in = window;
77  // Iteration of input is taken care of in execute_window_loop
78  window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
79  window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
80  window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
81 
82  Window window_k = calculate_max_window(*weights->info(), Steps(1u));
83 
84  Iterator in(input, window_in);
85  Iterator out(output, window_out);
86  Iterator w(weights, window_k);
87 
88  const uint8_t *weights_ptr = w.ptr();
89 
90  execute_window_loop(window_out, [&](const Coordinates & id)
91  {
92  int ih = 0;
93  int oh = 0;
94 
95  const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y + (id.z() / depth_multiplier) * input_stride_z + input_stride_w * id[3];
96  const uint8_t *ptr_weights_base = weights_ptr + id.z() * kernel_stride_z;
97 
98  const auto ptr_weights_r0 = reinterpret_cast<const T1 *>(ptr_weights_base);
99  const auto ptr_weights_r1 = reinterpret_cast<const T1 *>(ptr_weights_base + kernel_stride_y);
100  const auto ptr_weights_r2 = reinterpret_cast<const T1 *>(ptr_weights_base + kernel_stride_y * 2);
101  const auto vw_r0 = detail::load_matrix_row(ptr_weights_r0, weights_offset);
102  const auto vw_r1 = detail::load_matrix_row(ptr_weights_r1, weights_offset);
103  const auto vw_r2 = detail::load_matrix_row(ptr_weights_r2, weights_offset);
104 
105  for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
106  {
107  auto in_top = reinterpret_cast<const T1 *>(input_ptr + (ih + 0) * input_stride_y);
108  auto in_mid = reinterpret_cast<const T1 *>(input_ptr + (ih + dilation.y()) * input_stride_y);
109  auto in_low = reinterpret_cast<const T1 *>(input_ptr + (ih + 2 * dilation.y()) * input_stride_y); // uint8/int8
110  auto p_out = reinterpret_cast<T2 *>(out.ptr() + oh * output_stride_y); // int32
111 
112  for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
113  in_top += delta_input, in_mid += delta_input, in_low += delta_input,
114  p_out += num_elems_written_per_iteration)
115  {
116  if(dilation == Size2D(1U, 1U))
117  {
118  detail::convolve_3x3<false>(in_top, in_mid, in_low, p_out, vw_r0, vw_r1, vw_r2, stridex, input_offset);
119  }
120  else
121  {
122  auto vres = detail::convolve_3x3_dilation(in_top, in_mid, in_low, vw_r0, vw_r1, vw_r2, dilation.x(), stridex, input_offset);
123  detail::store_results<stridex>(p_out, vres);
124  }
125  }
126  }
127  },
128  out);
129  }
130 };
131 
132 template <typename T1, typename T2>
133 inline void convolve_3x3(const Window &window, unsigned int num_elems_written_per_iteration,
134  const ITensor *input, const ITensor *weights, ITensor *output,
135  const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
136 {
137  const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
138  switch(conv_stride_x)
139  {
140  case 1:
141  convolver_3x3<T1, T2, 1>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
142  break;
143  case 2:
144  convolver_3x3<T1, T2, 2>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
145  break;
146  case 3:
147  convolver_3x3<T1, T2, 3>::convolve(window, num_elems_written_per_iteration, input, weights, output, conv_info, depth_multiplier, dilation);
148  break;
149  default:
150  ARM_COMPUTE_ERROR("Not implemented");
151  }
152 }
153 
// Checks the static preconditions of the 3x3 depthwise kernel and returns the first violation as a Status.
//
// Checks visible in this listing:
//  - the weights must measure 3 along both the width and the height index;
//  - the first (horizontal) stride component must lie in [1, 3];
//  - when the output is already initialised (total_size() != 0) and the input is quantized asymmetric,
//    the output data type must be S32.
// Returns an empty Status when nothing fails.
//
// NOTE(review): the embedded listing numbers jump (155->159, 160->163, 168->171, 177->179), so statements are
// missing from this extract -- among them the definitions of width_idx / height_idx and the body of the else
// branch. Restore them from the original file before relying on this listing.
154 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
155 {
159 
160  const DataLayout data_layout = input->data_layout();
163 
// Only 3x3 filters are accepted.
164  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != 3 || weights->dimension(height_idx) != 3);
// The stride dispatcher in this file only has cases for horizontal strides 1, 2 and 3.
165  ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
166 
// Output checks are skipped while the output info is still empty.
167  if(output->total_size() != 0)
168  {
171 
172  if(is_data_type_quantized_asymmetric(input->data_type()))
173  {
174  ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != DataType::S32);
175  }
176  else
177  {
179  }
180  }
181 
182  return Status{};
183 }
184 
// Auto-initialises the output info if it is empty, computes the execution window and declares the
// access patterns (and therefore the padding) the kernel needs.
//
// Returns a pair of:
//  - a Status that carries "Insufficient Padding!" when update_window_and_padding() reports a change,
//    and is empty otherwise;
//  - the execution window, stepped by the number of elements written per iteration.
//
// NOTE(review): listing lines 192 and 210 are absent from this extract. `output_shape`, used when the
// output is auto-initialised, is therefore not defined in the visible text, and a case label may be
// missing ahead of the 16 + 15 * (dilation.x() - 1) assignment. Restore both from the original file.
185 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
186  const Size2D &dilation)
187 {
188  Window win;
189  bool window_changed = false;
190 
191  // Get convolved dimensions
// Quantized asymmetric inputs produce an S32 output; every other input keeps its own data type.
193  const DataType output_dt = is_data_type_quantized_asymmetric(input->data_type()) ? DataType::S32 : input->data_type();
194 
195  // Output auto initialization if not yet initialized
196  auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt).set_quantization_info(output->quantization_info()));
197 
198  // Configure kernel window (generic)
199  const unsigned int conv_stride_x = conv_info.stride().first;
200  const unsigned int conv_stride_y = conv_info.stride().second;
201  const unsigned int conv_pad_top = conv_info.pad_top();
202  const unsigned int conv_pad_left = conv_info.pad_left();
203 
// Default write width: 16 >> stride. The F16 case below overrides it with 32 >> stride.
204  unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
205  unsigned int num_elems_read_per_iteration = 0;
206 
// The read width per iteration depends on the data type and grows linearly with the horizontal dilation.
207  switch(input->data_type())
208  {
209  case DataType::QASYMM8:
211  num_elems_read_per_iteration = 16 + 15 * (dilation.x() - 1);
212  break;
213 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
214  case DataType::F16:
215  num_elems_written_per_iteration = 32 >> conv_stride_x;
216  num_elems_read_per_iteration = 24 + 23 * (dilation.x() - 1);
217  break;
218 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
219  case DataType::F32:
220  num_elems_read_per_iteration = 12 + 11 * (dilation.x() - 1);
221  break;
222  default:
223  ARM_COMPUTE_ERROR("Data type not supported.");
224  }
225 
226  // Configure kernel window
227  win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
228 
// Input rectangle: starts at (-pad_left, -pad_top) and is 3 + 2 * (dilation.y() - 1) rows tall.
229  AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration, 3 + 2 * (dilation.y() - 1), conv_stride_x, conv_stride_y);
230  AccessWindowStatic weights_access(weights, 0, 0, 3, 3);
231  AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
232 
233  window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
234  output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
235 
// A change reported by update_window_and_padding() is surfaced to the caller as a runtime error.
236  Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
237  return std::make_pair(err, win);
238 }
239 } // namespace
240 
// Member initialiser list and empty body of a constructor whose declarator line (listing line 241) is absent
// from this extract; presumably the NEDepthwiseConvolutionLayer3x3Kernel default constructor -- TODO confirm.
// Initial state: border size 0, value-initialised _input / _output / _weights / _conv_info / _dilation,
// zero elements written per iteration and a depth multiplier of 1.
242  : _border_size(0), _input(), _output(), _weights(), _conv_info(), _num_elems_written_per_iteration(0), _depth_multiplier(1), _dilation()
243 {
244 }
245 
// Body of an accessor that returns the stored _border_size.
// NOTE(review): the signature line (listing line 246) is absent from this extract; the tooltip text at the end
// of the page lists `BorderSize border_size() const override`, which presumably belongs here -- TODO confirm.
247 {
248  return _border_size;
249 }
250 
// Validates the arguments, stores the tensors and convolution parameters on the kernel, derives the number of
// elements written per iteration and the border size, then configures the execution window.
// Errors from validate_arguments() or validate_and_configure_window() are raised through ARM_COMPUTE_ERROR_THROW_ON.
//
// NOTE(review): listing lines 254 and 265 are absent from this extract; one statement before the validation
// call and one line between the QASYMM8 and F32 case labels are missing. Restore them from the original file.
// NOTE(review): unlike validate_and_configure_window(), the F16 case below is not wrapped in
// #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -- confirm that this is intentional.
251 void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
252  const Size2D &dilation)
253 {
255  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info, depth_multiplier, dilation));
256 
257  _input = input;
258  _output = output;
259  _weights = weights;
260  _conv_info = conv_info;
261  _depth_multiplier = depth_multiplier;
// Same write-width rule as validate_and_configure_window(): 32 >> stride for F16, 16 >> stride otherwise.
262  switch(input->info()->data_type())
263  {
264  case DataType::QASYMM8:
266  case DataType::F32:
267  _num_elems_written_per_iteration = 16 >> _conv_info.stride().first;
268  break;
269  case DataType::F16:
270  _num_elems_written_per_iteration = 32 >> _conv_info.stride().first;
271  break;
272  default:
273  ARM_COMPUTE_ERROR("Data type not supported.");
274  }
// The border is built directly from the four convolution paddings.
275  _border_size = BorderSize(_conv_info.pad_top(), _conv_info.pad_right(), _conv_info.pad_bottom(), _conv_info.pad_left());
276  _dilation = dilation;
277  auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, dilation);
278  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
279  INEKernel::configure(win_config.second);
280 }
281 
// Tail of a definition whose first signature line (listing line 282) and one body statement (line 285) are
// absent from this extract. The tooltip text at the end of the page lists a `static Status validate(...)` with
// this parameter list, which presumably belongs here -- TODO confirm.
// Visible behaviour: runs validate_and_configure_window() on clones of the three tensor infos, so the caller's
// objects are not the ones passed in; returns its error if there is one, otherwise an empty Status.
283  const Size2D &dilation)
284 {
286  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, dilation).first);
287  return Status{};
288 }
289 
// Body of a definition whose signature line (listing line 290) is absent from this extract. The tooltip text at
// the end of the page lists `void run(const Window &window, const ThreadInfo &info) override`, which presumably
// belongs here -- TODO confirm.
// Visible behaviour: selects the convolve_3x3<T1, T2> instantiation from the input data type and calls it with
// the state stored on the kernel. Floating-point types use the same type for input and output; the 8-bit
// instantiations write int32_t. Any other data type raises "Not implemented".
//
// NOTE(review): listing lines 292-293, 295 and 310 are also missing. As shown, the <int8_t, int32_t> call
// follows a `break` with no case label in front of it, so its label was evidently among the dropped lines.
// Restore them from the original file.
291 {
294 
296 
297  switch(_input->info()->data_type())
298  {
299 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
300  case DataType::F16:
301  convolve_3x3<float16_t, float16_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
302  break;
303 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
304  case DataType::F32:
305  convolve_3x3<float, float>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
306  break;
307  case DataType::QASYMM8:
308  convolve_3x3<uint8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
309  break;
311  convolve_3x3<int8_t, int32_t>(window, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info, _depth_multiplier, _dilation);
312  break;
313  default:
314  ARM_COMPUTE_ERROR("Not implemented");
315  }
316 }
317 } // namespace arm_compute
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U))
Static function to check if given info will lead to a valid configuration of NEDepthwiseConvolutionLayer3x3Kernel.
SimpleTensor< float > w
Definition: DFT.cpp:156
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
const DataLayout data_layout
Definition: Im2Col.cpp:146
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info, unsigned int depth_multiplier, const Size2D &dilation=Size2D(1U, 1U))
Calculate the depthwise convolution output shape of a tensor.
Container for 2D border size.
Definition: Types.h:272
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:545
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
virtual DataType data_type() const =0
Data type used for each element of the tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:792
1 channel, 1 F32 per channel
Store the tensor's metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
unsigned int pad_top() const
Get the top padding.
Definition: Types.h:773
Status class.
Definition: Error.h:52
int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration)
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
Interface for NEON tensor.
Definition: ITensor.h:36
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps=Steps(), bool skip_border=false, BorderSize border_size=BorderSize())
Calculate the maximum window for a given tensor shape and border setting.
Definition: Helpers.cpp:28
Copyright (c) 2017-2020 Arm Limited.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
Definition: Helpers.inl:207
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:108
1 channel, 1 F16 per channel
float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
ITensorInfo * info() const override
Interface to be implemented by the child class to return the tensor's metadata.
Definition: Tensor.cpp:33
1 channel, 1 S32 per channel
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: Helpers.h:437
float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset=0)
Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Definition: Validate.h:288
quantized, asymmetric fixed-point 8-bit number unsigned
T z() const
Alias to access the size of the third dimension.
Definition: Dimensions.h:91
std::pair< unsigned int, unsigned int > stride() const
Get the stride.
Definition: Types.h:737
UniformQuantizationInfo uniform() const
Return per layer quantization info.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
unsigned int pad_right() const
Get the right padding.
Definition: Types.h:768
Padding and stride information class.
Definition: Types.h:689
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
Definition: Utils.h:1143
void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U))
Initialize the function's source, destination, conv and border_size.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
Information about executing thread and CPU.
Definition: CPPTypes.h:235
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
Definition: Error.h:159
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Definition: Window.h:47
Class for specifying the size of an image or rectangle.
Definition: Size2D.h:34
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
Definition: Helpers.inl:128
float32x4x3_t load_matrix_row(const float *ptr)
T y() const
Alias to access the size of the second dimension.
Definition: Dimensions.h:86
quantized, asymmetric fixed-point 8-bit number signed
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
Definition: Helpers.inl:332
unsigned int pad_bottom() const
Get the bottom padding.
Definition: Types.h:778
DataType
Available data types.
Definition: Types.h:77
unsigned int pad_left() const
Get the left padding.
Definition: Types.h:763
DataLayout
[DataLayout enum definition]
Definition: Types.h:120
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
BorderSize border_size() const override
The size of the border for that kernel.