Compute Library
 21.11
CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
26 #include "arm_compute/core/Utils.h"
30 #include "src/core/CPP/Validate.h"
34 
35 #include "src/core/NEON/kernels/assembly/depthwise.hpp"
36 
37 #include "depthwise_common.hpp"
38 
39 #include <arm_neon.h>
40 
41 namespace arm_compute
42 {
43 namespace cpu
44 {
45 namespace kernels
46 {
48 
49 namespace
50 {
// Positions of the logical dimensions inside a tensor shape, used with
// ITensorInfo::dimension() by the helpers in this anonymous namespace.
// Channels sit at index 0 and batches at index 3; validate() in this file
// rejects every data layout other than NHWC.
// NOTE(review): presumably these indices encode the NHWC ordering - confirm.
51 constexpr unsigned int idx_width = 1;
52 constexpr unsigned int idx_height = 2;
53 constexpr unsigned int idx_channels = 0;
54 constexpr unsigned int idx_batches = 3;
55 
56 template <typename TSrc, typename TWeights, typename TDst>
57 void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
58  const ConvolutionInfo &info, const CPUInfo &cpu_info,
59  std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel)
60 {
61  unsigned int stride_cols{};
62  unsigned int stride_rows{};
63  std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
64 
65  const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
66 
67  const unsigned int n_batches = src->dimension(idx_batches);
68  const unsigned int src_rows = src->dimension(idx_height);
69  const unsigned int src_cols = src->dimension(idx_width);
70  const unsigned int n_channels = src->dimension(idx_channels);
71  const unsigned int dst_rows = dst->dimension(idx_height);
72  const unsigned int dst_cols = dst->dimension(idx_width);
73 
74  const unsigned int kernel_cols = weights->dimension(idx_width);
75  const unsigned int kernel_rows = weights->dimension(idx_height);
76 
77  const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
78 
79  arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
80  n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
81  padding, activation, nullptr);
82 
83  // Configure assembly pooling kernel
84  auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args);
85  if(dwc_kernel_asm == nullptr)
86  {
87  // Configuration not supported: Leave function unconfigured:
88  return;
89  }
90 
91  kernel = std::move(dwc_kernel_asm);
92 }
93 
// Try to create a depthwise-convolution assembly kernel with 32-bit requantization.
//
// Builds the same DepthwiseArgs as create_arm_dwc(), then derives the requantization
// parameters (offsets, multipliers, shifts, clamped activation range) and asks the
// assembly backend for a kernel.
//
// src, weights, dst : tensor infos; geometry and quantization info are read from them
// info              : convolution descriptor (strides, padding, activation, depth multiplier)
// cpu_info          : forwarded to the assembly backend
// kernel            : receives the created kernel; left untouched when the backend returns nullptr
// multipliers       : resized to one entry per weights scale; its data() pointer is handed to Requantize32
// right_shifts,
// left_shifts       : resized and filled only for per-channel quantized weights; their data()
//                     pointers are handed to Requantize32
// NOTE(review): the vectors are taken by reference, presumably so the buffers outlive this
// call while the kernel keeps pointers to them - confirm against arm_gemm::Requantize32.
94 template <typename TSrc, typename TWeights, typename TDst>
95 void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
96  const ConvolutionInfo &info, const CPUInfo &cpu_info,
97  std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
98  std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts)
99 {
100  unsigned int stride_cols{};
101  unsigned int stride_rows{};
102  std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
103 
104  const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
105 
106  const unsigned int n_batches = src->dimension(idx_batches);
107  const unsigned int src_rows = src->dimension(idx_height);
108  const unsigned int src_cols = src->dimension(idx_width);
109  const unsigned int n_channels = src->dimension(idx_channels);
110  const unsigned int dst_rows = dst->dimension(idx_height);
111  const unsigned int dst_cols = dst->dimension(idx_width);
112 
113  const unsigned int kernel_cols = weights->dimension(idx_width);
114  const unsigned int kernel_rows = weights->dimension(idx_height);
115 
116  const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
117 
118  arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
119  n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
120  padding, activation, nullptr);
121 
122  const auto src_qinfo = src->quantization_info().uniform();
123  const auto weights_qinfo = weights->quantization_info();
124  const auto dst_qinfo = dst->quantization_info().uniform();
125 
 // One multiplier/shift pair per entry of the weights scale vector.
126  const unsigned int num_filters = weights_qinfo.scale().size();
127 
128  multipliers.resize(num_filters);
129  std::vector<int32_t> dst_shifts(num_filters);
 // NOTE(review): listing line 130 (the head of the call whose arguments follow) is absent
 // from this listing; presumably it is the call that fills multipliers and dst_shifts
 // (compute_quantized_multipliers_and_shifts) - confirm against the original file.
131  weights,
132  dst,
133  multipliers.data(),
134  dst_shifts.data());
135 
 // Start from the full representable range of TSrc; narrow it only when an activation is enabled.
136  // Quantize activation bounds
137  int32_t min_activation = std::numeric_limits<TSrc>::lowest();
138  int32_t max_activation = std::numeric_limits<TSrc>::max();
139  if(info.act_info.enabled())
140  {
141  std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
142  }
143 
144  // Set quantization parameters for assembly kernels
145  arm_gemm::Requantize32 requant_args{};
146  if(is_data_type_quantized_per_channel(weights->data_type()))
147  {
148  left_shifts.resize(num_filters);
149  right_shifts.resize(num_filters);
150  bool need_left_shift = false; // Select more optimized path if left shift is not needed
151  for(unsigned int i = 0; i < num_filters; ++i)
152  {
 // Split the negated shift by sign: the positive part becomes the left shift,
 // the non-positive part becomes the right shift.
153  left_shifts[i] = std::max(-dst_shifts[i], static_cast<int32_t>(0));
154  right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0));
 // A negative dst_shift is exactly the case that produces a non-zero left shift.
155  if(dst_shifts[i] < 0 && !need_left_shift)
156  {
157  need_left_shift = true;
158  }
159  }
160 
 // Per-channel variant: pass arrays of shifts/multipliers; the left-shift array is
 // omitted (nullptr) when no channel needs a left shift.
161  requant_args = arm_gemm::Requantize32(nullptr,
162  0,
163  src_qinfo.offset,
164  weights_qinfo.uniform().offset,
165  dst_qinfo.offset,
166  (need_left_shift) ? left_shifts.data() : nullptr,
167  right_shifts.data(),
168  multipliers.data(),
169  static_cast<TSrc>(min_activation),
170  static_cast<TSrc>(max_activation));
171  }
172  else
173  {
 // Per-tensor variant: a single shift/multiplier taken from element 0.
 // NOTE(review): this reads dst_shifts[0] and multipliers[0], so it relies on the
 // weights scale vector being non-empty - confirm the caller guarantees that.
174  requant_args = arm_gemm::Requantize32(nullptr,
175  0,
176  src_qinfo.offset,
177  weights_qinfo.uniform().offset,
178  dst_qinfo.offset,
179  -dst_shifts[0],
180  multipliers[0],
181  static_cast<TSrc>(min_activation),
182  static_cast<TSrc>(max_activation));
183  }
184 
185  // Configure assembly depthwise kernel with requantization
186  auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
187  if(dwc_kernel_asm == nullptr)
188  {
189  // Configuration not supported: leave the caller's kernel handle untouched
190  return;
191  }
192 
193  kernel = std::move(dwc_kernel_asm);
194 }
195 } // namespace
196 
// Member-initialiser list and empty body of the constructor: the object starts with no
// assembly kernel attached (_kernel_asm is null) and with empty multiplier/shift vectors.
// NOTE(review): the line holding the constructor declarator is absent from this listing.
198  : _kernel_asm(nullptr),
199  _multipliers(),
200  _left_shifts(),
201  _right_shifts()
202 {
203 }
204 
206 
// configure(): initialises dst if it is still empty, tries to create an assembly kernel for
// the source data type, and sets the execution window to the maximum window over dst.
// On anything other than AArch64, or for a data type not handled below, no kernel is
// created and _kernel_asm keeps its previous value.
// NOTE(review): the first line of the signature (listing line 207) and listing lines 221
// and 230 are absent from this listing; the comments below hedge where that matters.
208  const ConvolutionInfo &info, const CPUInfo &cpu_info)
209 {
 // cpu_info is only consumed inside the __aarch64__ section below; this keeps builds
 // for other targets free of unused-parameter warnings.
210  ARM_COMPUTE_UNUSED(cpu_info);
211  ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
212 
213  // Destination initialization if not yet initialized
214  const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info);
215  auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
216 
217 #if defined(__aarch64__)
 // Pick the template instantiation from the source data type.
218  switch(src->data_type())
219  {
220  case DataType::QASYMM8:
 // NOTE(review): the condition selecting between int8_t and uint8_t weights (listing
 // line 221) is absent from this listing; presumably it tests for per-channel
 // quantized weights - confirm against the original file.
222  {
223  create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
224  }
225  else
226  {
227  create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
228  }
229  break;
 // NOTE(review): the case label for this all-int8_t branch (listing line 230) is
 // absent from this listing; presumably DataType::QASYMM8_SIGNED.
231  create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
232  break;
233 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
234  case DataType::F16:
235  create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm);
236  break;
237 #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
238  case DataType::F32:
239  create_arm_dwc<float, float, float>(src, weights, dst, info, cpu_info, _kernel_asm);
240  break;
241  default:
 // Unhandled data type: no assembly kernel is created.
242  break;
243  }
244 #endif // defined(__aarch64__)
245 
 // The window is configured regardless of whether an assembly kernel was created.
246  Window win = calculate_max_window(*dst, Steps());
247  ICpuKernel::configure(win);
248 }
249 
// validate(): static check of whether the assembly kernels can handle the given arguments;
// returns an empty Status when every visible check passes.
// NOTE(review): the signature line and many of the checks (including the conditions of
// several if-statements and the bodies of some branches) are absent from this listing;
// only the checks that are visible are described here.
251 {
253 
 // The assembly kernels are AArch64-only: every other target fails immediately.
254 #if !defined(__aarch64__)
255  ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
256 #endif // !defined(__aarch64__)
 // Layout and dilation restrictions of the assembly kernels.
259  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels");
260  ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.dilation != Size2D(1, 1), "Assembly kernels do not support dilation != (1, 1)");
261 
 // NOTE(review): the condition of this if-statement is absent from this listing.
263  {
 // The number of weight scales must equal dimension 0 of the weights.
265  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
266  }
267  else
268  {
270  }
271 
 // Bias is optional; when present its dimension 0 must match dimension 0 of the weights.
272  if(bias != nullptr)
273  {
275  ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0));
276 
278  {
280  }
281  else
282  {
284  }
285  }
286 
 // Checks that only apply when dst has already been initialised (non-zero total size).
287  if(dst->total_size() > 0)
288  {
292  }
293  return Status{};
294 }
295 
// run_op(): fetches the tensors from the pack, derives the leading dimensions of source and
// destination from their shapes and paddings, and runs the assembly kernel for the calling
// thread (info.thread_id out of info.num_threads).
// NOTE(review): the signature line is absent from this listing.
297 {
 // The kernel must have been created by configure() before it can be executed.
298  ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
 // window is not used here: the assembly kernel is given thread id / thread count
 // rather than a window.
300  ARM_COMPUTE_UNUSED(window);
 // NOTE(review): info is marked unused although info.thread_id and info.num_threads
 // are read below.
301  ARM_COMPUTE_UNUSED(info);
302 
303  ARM_COMPUTE_ERROR_ON(tensors.empty());
304 
 // ACL_INT_0 carries the working space, ACL_INT_1 the packed parameters.
 // NOTE(review): none of the four tensors is null-checked before being dereferenced.
305  const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
306  ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
307  ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
308  ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1);
309 
 // Raw pointers to the first element of each buffer.
310  const auto src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
311  auto dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
312  auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
313  auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes();
314 
315  const auto src_shape = src->info()->tensor_shape();
316  const auto dst_shape = dst->info()->tensor_shape();
317  const auto src_padding = src->info()->padding();
318  const auto dst_padding = dst->info()->padding();
319 
 // Leading dimensions, built up from the innermost dimension outwards: each level is
 // the previous one multiplied by the (padded) extent of the next dimension.
 // NOTE(review): these look like strides counted in elements - confirm the unit
 // expected by the assembly kernel's execute().
320  const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
321  const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
322  const size_t ld_src_batch = ld_src_row * src_shape[2];
323  const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right;
324  const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
325  const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
326 
327  _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch,
328  parameters_ptr,
329  dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
330  working_space, info.thread_id, info.num_threads);
331 }
332 
333 void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row)
334 {
335  _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row);
336 }
337 
// Returns whatever the assembly kernel's get_storage_size() reports.
// _kernel_asm is dereferenced without a null check, so this must only be reached
// once an assembly kernel has been created.
// NOTE(review): the declarator line is absent from this listing; the function name is
// inferred from the forwarded call.
339 {
340  return _kernel_asm->get_storage_size();
341 }
342 
343 size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads, unsigned int num_input_channels) const
344 {
345  return _kernel_asm->get_working_size(num_threads, num_input_channels);
346 }
347 
// Reports whether an assembly kernel is currently attached, i.e. _kernel_asm is non-null.
// NOTE(review): the declarator line is absent from this listing; presumably this is
// is_configured() - confirm against the original file.
349 {
350  return _kernel_asm != nullptr;
351 }
352 
// Returns the class name of this kernel as a string literal.
// NOTE(review): the declarator line is absent from this listing; presumably this is
// name() - confirm against the original file.
354 {
355  return "CpuDepthwiseConv2dAssemblyWrapperKernel";
356 }
357 
// Return minimum workload size of the relevant kernel.
// Both arguments are ignored, so the result does not depend on platform or thread count.
// NOTE(review): the return statement (listing line 362) is absent from this listing;
// presumably it returns a fixed constant - confirm against the original file.
358 size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
359 {
360  ARM_COMPUTE_UNUSED(platform, thread_count);
361 
363 }
364 } // namespace kernels
365 } // namespace cpu
366 } // namespace arm_compute
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
Definition: Utils.h:981
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
Shape of a tensor.
Definition: TensorShape.h:39
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
Calculate the depthwise convolution output shape of a tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:115
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
Indicates whether or not this function can be used to process the given parameters.
static constexpr size_t small_network_mws
Definition: ICPPKernel.h:42
bool empty() const
Checks if pack is empty.
Definition: ITensorPack.cpp:80
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Store the tensor's metadata.
Definition: ITensorInfo.h:40
size_t get_working_size(unsigned int num_threads, unsigned int num_input_channels) const
Get size of the workspace needed by the assembly kernel.
Status class.
Definition: Error.h:52
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
Interface for CPU tensor.
Definition: ITensor.h:36
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Definition: Validate.h:284
SimpleTensor< float > src
Definition: DFT.cpp:155
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Definition: Validate.h:159
size_t get_storage_size() const
Get the amount of storage space required for the rearranged weights and bias.
1 channel, 1 S32 per channel
void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row)
Pack bias and weights in a storage space for the assembly kernel.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
Definition: ITensorPack.cpp:54
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
Definition: Utils.h:1058
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
std::pair< int32_t, int32_t > get_quantized_activation_min_max(ActivationLayerInfo act_info, DataType data_type, UniformQuantizationInfo oq_info)
Returns a pair of minimum and maximum values for a quantized activation.
Definition: Utils.cpp:488
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
virtual uint8_t * buffer() const =0
Interface to be implemented by the child class to return a pointer to CPU memory. ...
Size2D dilation
Dilation, in elements, across x and y.
Definition: Types.h:1909
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
const std::vector< float > & scale() const
Scale vector accessor.
virtual PaddingSize padding() const =0
Padding of tensor.
unsigned int left
left of the border
Definition: Types.h:380
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:915
quantized, symmetric per channel fixed-point 8-bit number
virtual size_t offset_first_element_in_bytes() const =0
The offset from the beginning of the memory allocation to the first element of the tensor...
void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr)
Compute quantized per-channel multipliers and shifts.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
Definition: ITensorPack.cpp:64
size_t get_mws(const CPUInfo &platform, size_t thread_count) const override
Return minimum workload size of the relevant kernel.
#define ARM_COMPUTE_RETURN_ERROR_MSG(...)
An error is returned with the given description.
Definition: Error.h:194
Information about executing thread and CPU.
Definition: CPPTypes.h:158
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
Class for specifying the size of an image or rectangle.
Definition: Size2D.h:34
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:541
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:788
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:244
Tensor packing service.
Definition: ITensorPack.h:39
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157
void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info)
Initialise the kernel's src and dst.
quantized, asymmetric fixed-point 8-bit number signed
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
Performs a mapping between Compute Library ActivationLayerInfo and the assembly Activation structure...
arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info)
Performs a mapping between Compute Library PadStrideInfo and the assembly PaddingValues structure...
Describe a multidimensional execution window.
Definition: Window.h:39
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.