Compute Library 19.08
NEDepthwiseConvolutionLayerNativeKernel.cpp
/*
 * Copyright (c) 2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/NEON/wrapper/traits.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"

#include "support/ToolchainSupport.h"

namespace arm_compute
{
namespace
{
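// Depthwise loop for depth_multiplier == 1 (NHWC): each window step accumulates a vector of S
// channels with NEON multiply-accumulate, walking the filter footprint with the given dilation.
// Reads that would land in the padded region are clamped to input_max_offset so they never
// leave the input buffer.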
template <typename T, int S, bool has_biases>
void depthwise_loop_multiplier1(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                                const Size2D &dilation, const Window &window)
{
    using VectorType = typename wrapper::traits::neon_vector<T, S>::type;
    using TagType    = typename wrapper::traits::neon_vector<T, S>::tag_type;

    const size_t input_stride_y   = input->info()->strides_in_bytes().y();
    const size_t input_stride_z   = input->info()->strides_in_bytes().z();
    const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
                                    input->info()->strides_in_bytes().y();
    const size_t weights_width    = weights->info()->dimension(1);
    const size_t weights_height   = weights->info()->dimension(2);
    const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
    const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
    const size_t conv_stride_x    = conv_info.stride().first;
    const size_t conv_stride_y    = conv_info.stride().second;
    const size_t conv_pad_left    = conv_info.pad_left();
    const size_t conv_pad_top     = conv_info.pad_top();

    Window win_input = window;
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    Window win_weights = win_input;
    win_weights.set(3, Window::Dimension(0, 0, 0));

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, window);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(window, [&](const Coordinates & id)
    {
        VectorType acc = wrapper::vdup_n(static_cast<T>(0), TagType{});

        const int input_y      = id.y() * conv_stride_x - conv_pad_left;
        const int input_z      = id.z() * conv_stride_y - conv_pad_top;
        int       input_offset = input_y * input_stride_y + input_z * input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < weights_width; ++w)
            {
                const auto input_vals   = wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));
                const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * weights_stride_y));

                acc = wrapper::vmla(acc, weights_vals, input_vals);
                offs += dilation.x() * input_stride_y;
            }

            weights_ptr += weights_stride_z;
            input_offset += dilation.y() * input_stride_z;
        }

        if(has_biases)
        {
            const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()));
            acc                    = wrapper::vadd(acc, biases_vals);
        }

        wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()), acc);
    },
    input_it, weights_it, biases_it, output_it);
}

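// Generic depthwise loop for depth_multiplier > 1 (NHWC): the input is read one element at a
// time and each input channel is multiplied against depth_multiplier filter channels, so the
// accumulators are kept in a small per-multiplier vector instead of a NEON register.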
template <typename T, bool has_biases>
void depthwise_loop_generic(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
                            const Size2D &dilation, unsigned int depth_multiplier, const Window &window)
{
    const size_t input_stride_y   = input->info()->strides_in_bytes().y();
    const size_t input_stride_z   = input->info()->strides_in_bytes().z();
    const size_t input_max_offset = input->info()->strides_in_bytes().z() * input->info()->dimension(2) - (input->info()->padding().bottom + input->info()->padding().top) *
                                    input->info()->strides_in_bytes().y();
    const size_t weights_width    = weights->info()->dimension(1);
    const size_t weights_height   = weights->info()->dimension(2);
    const size_t weights_stride_y = weights->info()->strides_in_bytes().y();
    const size_t weights_stride_z = weights->info()->strides_in_bytes().z();
    const size_t conv_stride_x    = conv_info.stride().first;
    const size_t conv_stride_y    = conv_info.stride().second;
    const size_t conv_pad_left    = conv_info.pad_left();
    const size_t conv_pad_top     = conv_info.pad_top();

    Window win_input = window;
    win_input.set(Window::DimY, Window::Dimension(0, 0, 0));
    win_input.set(Window::DimZ, Window::Dimension(0, 0, 0));

    Window win_weights = win_input;
    win_weights.set(3, Window::Dimension(0, 0, 0));

    win_input.set_dimension_step(Window::DimX, 1);

    Iterator input_it(input, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(output, window);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(window, [&](const Coordinates & id)
    {
        std::vector<T> acc(depth_multiplier, static_cast<T>(0));

        const int input_y      = id.y() * conv_stride_x - conv_pad_left;
        const int input_z      = id.z() * conv_stride_y - conv_pad_top;
        int       input_offset = input_y * input_stride_y + input_z * input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < weights_width; ++w)
            {
                const auto input_val = *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), input_max_offset)));

                for(size_t m = 0; m < depth_multiplier; ++m)
                {
                    const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * weights_stride_y));
                    acc.at(m)              = support::cpp11::fma(weights_val, input_val, acc.at(m));
                }

                offs += dilation.x() * input_stride_y;
            }

            weights_ptr += weights_stride_z;
            input_offset += dilation.y() * input_stride_z;
        }

        if(has_biases)
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
            }
        }
        else
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
            }
        }
    },
    input_it, weights_it, biases_it, output_it);
}

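// Shared argument validation for configure() and validate(): checks the data type, the shape
// compatibility between input, weights, biases and output, and the sanity of the depth
// multiplier, dilation and stride values.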
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                          const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
    ARM_COMPUTE_RETURN_ERROR_ON(depth_multiplier == 0);
    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * depth_multiplier) != weights->dimension(0));
    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
    ARM_COMPUTE_RETURN_ERROR_ON((conv_info.stride().first < 1) || (conv_info.stride().second < 1));

    if(biases != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
    }

    if(output->total_size() != 0)
    {
        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
    }

    return Status{};
}
} // namespace

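// Computes the convolved output shape, auto-initializes the output tensor if it is still empty,
// and builds the execution window together with the padding requirements of the input, weights,
// biases and output access patterns.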
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *biases,
                                                        ITensorInfo *output, const PadStrideInfo &conv_info,
                                                        unsigned int depth_multiplier, const Size2D &dilation)
{
    // Get convolved dimensions
    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier, dilation);

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));

    // Configure kernel window (generic)
    const unsigned int num_elems_read_per_iteration    = (depth_multiplier == 1) ? 8 / element_size_from_data_type(input->data_type()) : 1;
    const unsigned int num_elems_written_per_iteration = num_elems_read_per_iteration * depth_multiplier;

    // Configure kernel window
    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));

    AccessWindowStatic input_access(input, 0, -conv_info.pad_left(), ceil_to_multiple(num_elems_read_per_iteration, input->dimension(0)),
                                    input->dimension(1) + std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()));
    AccessWindowHorizontal weights_access(weights, 0, num_elems_written_per_iteration);
    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);

    bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);

    if(biases != nullptr)
    {
        AccessWindowHorizontal biases_access(biases, 0, num_elems_written_per_iteration);
        window_changed |= update_window_and_padding(win, biases_access);
    }

    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}

NEDepthwiseConvolutionLayerNativeKernel::NEDepthwiseConvolutionLayerNativeKernel()
    : _func(), _border_size(0), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation()
{
}

BorderSize NEDepthwiseConvolutionLayerNativeKernel::border_size() const
{
    return _border_size;
}

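// configure(): validates the arguments, captures the tensor pointers and convolution metadata,
// selects the templated run_depthwise() specialization (currently F32 only, with a vector width
// of 2), and configures the kernel window.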
void NEDepthwiseConvolutionLayerNativeKernel::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
                                                        const PadStrideInfo &conv_info, unsigned int depth_multiplier, const Size2D &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, depth_multiplier, dilation));

    _input            = input;
    _weights          = weights;
    _biases           = biases;
    _output           = output;
    _conv_info        = conv_info;
    _depth_multiplier = depth_multiplier;
    _border_size      = BorderSize(_conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
    _dilation         = dilation;

    switch(_input->info()->data_type())
    {
        case DataType::F32:
            _func = (biases != nullptr) ? &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, 2, true> : &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, 2, false>;
            break;
        default:
            ARM_COMPUTE_ERROR("Data type not supported");
            break;
    }

    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), (biases != nullptr) ? biases->info() : nullptr, _output->info(), _conv_info, _depth_multiplier, dilation);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    INEKernel::configure(win_config.second);
}

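// Static validation path: runs the same argument and window checks as configure() on cloned
// tensor infos, without modifying the caller's tensors.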
Status NEDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                                         unsigned int depth_multiplier,
                                                         const Size2D &dilation)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info, depth_multiplier, dilation));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), (biases != nullptr) ? biases->clone().get() : nullptr, output->clone().get(), conv_info,
                                                              depth_multiplier, dilation)
                                .first);
    return Status{};
}

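// Kernel entry point: dispatches to the run_depthwise() specialization selected in configure().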
void NEDepthwiseConvolutionLayerNativeKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    (this->*_func)(window);
}

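// Chooses between the vectorized multiplier-1 loop and the generic per-multiplier loop at run time.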
template <typename T, int S, bool has_biases>
void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(const Window &window)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    if(_depth_multiplier == 1)
    {
        depthwise_loop_multiplier1<T, S, has_biases>(_input, _weights, _biases, _output, _conv_info, _dilation, window);
    }
    else
    {
        depthwise_loop_generic<T, has_biases>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, window);
    }
}
} // namespace arm_compute
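
For context, a minimal usage sketch follows (not part of the file above). It drives the kernel directly on F32 NHWC tensors; the tensor shapes and convolution parameters are assumed example values, and in practice the kernel is normally reached through the NEDepthwiseConvolutionLayer runtime function rather than configured by hand.

// Illustrative sketch only: shapes, strides and padding are assumed example values.
#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // NHWC tensors: shapes are ordered (C, W, H, N).
    TensorInfo src_info(TensorShape(32U, 16U, 16U, 1U), 1, DataType::F32);
    TensorInfo weights_info(TensorShape(32U, 3U, 3U), 1, DataType::F32);
    TensorInfo biases_info(TensorShape(32U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    weights_info.set_data_layout(DataLayout::NHWC);

    Tensor src, weights, biases, dst;
    src.allocator()->init(src_info);
    weights.allocator()->init(weights_info);
    biases.allocator()->init(biases_info);

    // 3x3 depthwise convolution, stride 1, no padding, depth multiplier 1, no dilation.
    NEDepthwiseConvolutionLayerNativeKernel kernel;
    kernel.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 0, 0), 1, Size2D(1U, 1U));

    // Allocate after configure() so any padding requested by the kernel window is in place;
    // dst has been auto-initialized during configuration.
    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src, weights and biases with data ...

    NEScheduler::get().schedule(&kernel, Window::DimY);
    return 0;
}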