Compute Library
 22.11
CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
26 #include "arm_compute/core/Utils.h"
30 #include "src/core/CPP/Validate.h"
34 
35 #include "src/core/NEON/kernels/assembly/depthwise.hpp"
36 
37 #include "depthwise_common.hpp"
38 
39 #include <arm_neon.h>
40 
41 namespace arm_compute
42 {
43 namespace cpu
44 {
45 namespace kernels
46 {
48 
49 namespace
50 {
// Dimension indices passed to ITensorInfo::dimension() by the helpers below
// to read the width, height, channel and batch extents of a tensor.
51 constexpr unsigned int idx_width = 1;
52 constexpr unsigned int idx_height = 2;
53 constexpr unsigned int idx_channels = 0;
54 constexpr unsigned int idx_batches = 3;
55 
/** Create the assembly depthwise convolution kernel for the non-requantizing case.
 *
 * Reads strides, padding, tensor extents and the activation from the arguments, packs them into
 * arm_conv::depthwise::DepthwiseArgs and requests a kernel from arm_conv::depthwise::depthwise<>().
 *
 * @tparam TSrc     Element type of the source tensor.
 * @tparam TWeights Element type of the weights tensor.
 * @tparam TDst     Element type of the destination tensor.
 *
 * @param[in]  src      Source tensor info; batch, row, column and channel counts are read from it.
 * @param[in]  weights  Weights tensor info; kernel rows and columns are read from it.
 * @param[in]  dst      Destination tensor info; output rows and columns are read from it.
 * @param[in]  info     Convolution descriptor (pad/stride info, activation, depth multiplier).
 * @param[in]  cpu_info CPU information whose address is handed to the assembly arguments.
 * @param[out] kernel   Receives the created kernel. Left untouched if no kernel could be created.
 * @param[out] _name    Receives the created kernel's name. Left untouched if no kernel could be created.
 */
56 template <typename TSrc, typename TWeights, typename TDst>
57 void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
58  const ConvolutionInfo &info, const CPUInfo &cpu_info,
59  std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, std::string &_name)
60 {
61  unsigned int stride_cols{};
62  unsigned int stride_rows{};
63  std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
64 
65  const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
66 
67  const unsigned int n_batches = src->dimension(idx_batches);
68  const unsigned int src_rows = src->dimension(idx_height);
69  const unsigned int src_cols = src->dimension(idx_width);
70  const unsigned int n_channels = src->dimension(idx_channels);
71  const unsigned int dst_rows = dst->dimension(idx_height);
72  const unsigned int dst_cols = dst->dimension(idx_width);
73 
74  const unsigned int kernel_cols = weights->dimension(idx_width);
75  const unsigned int kernel_rows = weights->dimension(idx_height);
76 
77  const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
78 
79  arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
80  n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
81  padding, activation, nullptr);
82 
83  // Configure assembly depthwise convolution kernel
84  auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args);
85  if(dwc_kernel_asm == nullptr)
86  {
87  // Configuration not supported: Leave function unconfigured:
88  return;
89  }
90 
 // Only publish the name and hand over ownership once a kernel is known to exist.
91  _name = dwc_kernel_asm->name();
92  kernel = std::move(dwc_kernel_asm);
93 }
94 
/** Create the assembly depthwise convolution kernel with requantization parameters.
 *
 * Builds the same arm_conv::depthwise::DepthwiseArgs as the non-quantized helper, then derives
 * multipliers/shifts and activation bounds, wraps them in an arm_gemm::Requantize32 and requests
 * a kernel from arm_conv::depthwise::depthwise<..., arm_gemm::Requantize32>().
 *
 * @tparam TSrc     Element type of the source tensor (also used for the activation bounds).
 * @tparam TWeights Element type of the weights tensor.
 * @tparam TDst     Element type of the destination tensor.
 *
 * @param[in]  src          Source tensor info (extents, uniform quantization info, data type).
 * @param[in]  weights      Weights tensor info (kernel extents, quantization info, data type).
 * @param[in]  dst          Destination tensor info (output extents, uniform quantization info).
 * @param[in]  info         Convolution descriptor (pad/stride info, activation, depth multiplier).
 * @param[in]  cpu_info     CPU information whose address is handed to the assembly arguments.
 * @param[out] kernel       Receives the created kernel. Left untouched if no kernel could be created.
 * @param[out] multipliers  Resized to one entry per weights scale value; its data pointer (or first
 *                          element) is passed to Requantize32.
 * @param[out] right_shifts Resized and filled only when the weights are per-channel quantized.
 * @param[out] left_shifts  Resized and filled only when the weights are per-channel quantized.
 * @param[out] _name        Receives the created kernel's name. Left untouched if no kernel could be created.
 *
 * @note The per-channel branch passes raw data pointers of the caller-provided vectors to
 *       Requantize32, presumably so they stay valid after this function returns - confirm
 *       against the Requantize32 definition.
 */
95 template <typename TSrc, typename TWeights, typename TDst>
96 void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
97  const ConvolutionInfo &info, const CPUInfo &cpu_info,
98  std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
99  std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts,
100  std::string &_name)
101 {
102  unsigned int stride_cols{};
103  unsigned int stride_rows{};
104  std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
105 
106  const arm_conv::PaddingValues padding = assembly_utils::map_to_arm_conv_padding(info.pad_stride_info);
107 
108  const unsigned int n_batches = src->dimension(idx_batches);
109  const unsigned int src_rows = src->dimension(idx_height);
110  const unsigned int src_cols = src->dimension(idx_width);
111  const unsigned int n_channels = src->dimension(idx_channels);
112  const unsigned int dst_rows = dst->dimension(idx_height);
113  const unsigned int dst_cols = dst->dimension(idx_width);
114 
115  const unsigned int kernel_cols = weights->dimension(idx_width);
116  const unsigned int kernel_rows = weights->dimension(idx_height);
117 
118  const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
119 
120  arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
121  n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
122  padding, activation, nullptr);
123 
124  const auto src_qinfo = src->quantization_info().uniform();
125  const auto weights_qinfo = weights->quantization_info();
126  const auto dst_qinfo = dst->quantization_info().uniform();
127 
 // One multiplier/shift entry per scale value stored in the weights' quantization info.
128  const unsigned int num_filters = weights_qinfo.scale().size();
129 
130  multipliers.resize(num_filters);
131  std::vector<int32_t> dst_shifts(num_filters);
 // NOTE(review): listing line 132 is missing from this extract. The arguments below presumably
 // complete a call to compute_quantized_multipliers_and_shifts(src, ...) that fills multipliers
 // and dst_shifts - confirm against the original file.
133  weights,
134  dst,
135  multipliers.data(),
136  dst_shifts.data());
137 
138  // Quantize activation bounds
 // Default to the full range of TSrc; narrowed only when an activation is enabled.
139  int32_t min_activation = std::numeric_limits<TSrc>::lowest();
140  int32_t max_activation = std::numeric_limits<TSrc>::max();
141  if(info.act_info.enabled())
142  {
143  std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
144  }
145 
146  // Set quantization parameters for assembly kernels
147  arm_gemm::Requantize32 requant_args{};
148  if(is_data_type_quantized_per_channel(weights->data_type()))
149  {
150  left_shifts.resize(num_filters);
151  right_shifts.resize(num_filters);
152  bool need_left_shift = false; // Select more optimized path if left shift is not needed
153  for(unsigned int i = 0; i < num_filters; ++i)
154  {
 // Split the negated shift by sign: the positive part goes to left_shifts (else 0),
 // the non-positive part goes to right_shifts (else 0).
155  left_shifts[i] = std::max(-dst_shifts[i], static_cast<int32_t>(0));
156  right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0));
 // A single negative dst_shift is enough to require the left-shift array.
157  if(dst_shifts[i] < 0 && !need_left_shift)
158  {
159  need_left_shift = true;
160  }
161  }
162 
 // Per-channel form: pointers to the per-filter arrays; left shifts are omitted (nullptr)
 // when no filter needs one.
163  requant_args = arm_gemm::Requantize32(nullptr,
164  0,
165  src_qinfo.offset,
166  weights_qinfo.uniform().offset,
167  dst_qinfo.offset,
168  (need_left_shift) ? left_shifts.data() : nullptr,
169  right_shifts.data(),
170  multipliers.data(),
171  static_cast<TSrc>(min_activation),
172  static_cast<TSrc>(max_activation));
173  }
174  else
175  {
 // Per-tensor form: only the first shift/multiplier entry is used.
176  requant_args = arm_gemm::Requantize32(nullptr,
177  0,
178  src_qinfo.offset,
179  weights_qinfo.uniform().offset,
180  dst_qinfo.offset,
181  -dst_shifts[0],
182  multipliers[0],
183  static_cast<TSrc>(min_activation),
184  static_cast<TSrc>(max_activation));
185  }
186 
187  // Configure assembly depthwise convolution kernel with requantization
188  auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
189  if(dwc_kernel_asm == nullptr)
190  {
191  // Configuration not supported: Leave function unconfigured:
192  return;
193  }
194  _name = dwc_kernel_asm->name();
195  kernel = std::move(dwc_kernel_asm);
196 }
197 } // namespace
198 
// NOTE(review): listing line 199 (presumably the constructor signature) is missing from this extract.
// Member initializer list: starts with no assembly kernel, empty multiplier/shift vectors
// and an empty name; all of them are filled in later by configure().
200  : _kernel_asm(nullptr),
201  _multipliers(),
202  _left_shifts(),
203  _right_shifts(),
204  _name()
205 {
206 }
207 
209 
// NOTE(review): listing line 210 (start of the signature, presumably
// CpuDepthwiseConv2dAssemblyWrapperKernel::configure) is missing from this extract, as are
// listing lines 225 and 234 inside the switch below. The remaining lines are reproduced as-is.
//
// Initialises dst if it is still empty, tries to create an assembly kernel matching the source
// data type, and configures the kernel window over dst. _kernel_asm stays null when no assembly
// kernel is created.
211  const ConvolutionInfo &info, const CPUInfo &cpu_info)
212 {
 // cpu_info is only consumed inside the __aarch64__ section below.
213  ARM_COMPUTE_UNUSED(cpu_info);
214  ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
215 
216  // Destination initialization if not yet initialized
217  const TensorShape dst_shape = compute_depthwise_convolution_shape(*src, *weights, info);
218  auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
 // Base name; the assembly kernel's own name is appended at the end if one was created.
219  _name = "CpuDepthwiseConv2dAssemblyWrapperKernel";
220  std::string asm_kernel_name("");
221 #if defined(__aarch64__)
222  switch(src->data_type())
223  {
224  case DataType::QASYMM8:
 // NOTE(review): listing line 225 is missing; presumably an `if` selecting between
 // int8_t and uint8_t weights (see the differing TWeights arguments) - confirm.
226  {
227  create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
228  }
229  else
230  {
231  create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
232  }
233  break;
 // NOTE(review): listing line 234 is missing; presumably the case label for the
 // signed 8-bit quantized source type - confirm.
235  create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
236  break;
237 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
238  case DataType::F16:
239  create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name);
240  break;
241 #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
242  case DataType::F32:
243  create_arm_dwc<float, float, float>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name);
244  break;
245  default:
 // Unsupported data type: no assembly kernel is created.
246  break;
247  }
248 #endif // defined(__aarch64__)
249 
250  Window win = calculate_max_window(*dst, Steps());
251  ICpuKernel::configure(win);
252  if(_kernel_asm != nullptr)
253  {
254  _name += "/" + asm_kernel_name;
255  }
256 }
257 
// NOTE(review): the signature (listing line 258, presumably
// CpuDepthwiseConv2dAssemblyWrapperKernel::validate) and many check lines are missing from
// this extract: 260, 265-266, 270, 272, 277, 282, 285, 287, 291 and 297-299. Only the surviving
// lines are reproduced, so several braces below appear without their condition or body.
//
// Returns an error Status on the first failed check, otherwise an empty Status.
259 {
261 
 // Non-AArch64 builds are rejected unconditionally.
262 #if !defined(__aarch64__)
263  ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
264 #endif // !defined(__aarch64__)
267  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels");
268  ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.dilation != Size2D(1, 1), "Assembly kernels do not support dilation != (1, 1)");
269 
 // NOTE(review): the condition on listing line 270 is missing; the surviving check in the
 // first branch requires one quantization scale per element of weights dimension 0.
271  {
273  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
274  }
275  else
276  {
278  }
279 
 // Bias is optional; when present its dimension 0 must match weights dimension 0.
280  if(bias != nullptr)
281  {
283  ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0));
284 
286  {
288  }
289  else
290  {
292  }
293  }
294 
 // Checks on dst apply only when it has already been initialised (non-zero total size).
295  if(dst->total_size() > 0)
296  {
300  }
301  return Status{};
302 }
303 
// NOTE(review): the signature (listing line 304, presumably
// CpuDepthwiseConv2dAssemblyWrapperKernel::run_op) and listing line 307 are missing from this extract.
//
// Fetches the source, destination, workspace and packed-parameter tensors from the pack,
// derives leading dimensions from tensor shape and padding, and runs the assembly kernel
// for the calling thread.
305 {
306  ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
 // window is not used by this function.
 // NOTE(review): info is also marked unused although info.thread_id and info.num_threads
 // are read in the execute() call below.
308  ARM_COMPUTE_UNUSED(window);
309  ARM_COMPUTE_UNUSED(info);
310 
311  ARM_COMPUTE_ERROR_ON(tensors.empty());
312 
313  const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
314  ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
315  ITensor *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
316  ITensor *storage = tensors.get_tensor(TensorType::ACL_INT_1);
317 
 // Pointers to the first element of each tensor's buffer.
318  const auto src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
319  auto dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
320  auto working_space = workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
321  auto parameters_ptr = storage->buffer() + storage->info()->offset_first_element_in_bytes();
322 
323  const auto src_shape = src->info()->tensor_shape();
324  const auto dst_shape = dst->info()->tensor_shape();
325  const auto src_padding = src->info()->padding();
326  const auto dst_padding = dst->info()->padding();
327 
 // Leading dimensions: each level multiplies the previous one by the next shape extent.
 // Left/right padding widens dimension 0 and top/bottom padding widens dimension 1.
328  const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
329  const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
330  const size_t ld_src_batch = ld_src_row * src_shape[2];
331  const size_t ld_dst_col = dst_shape[0] + dst_padding.left + dst_padding.right;
332  const size_t ld_dst_row = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
333  const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
334 
335  _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch,
336  parameters_ptr,
337  dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
338  working_space, info.thread_id, info.num_threads);
339 }
340 
// Forwards all arguments unchanged to the assembly kernel's pack_parameters().
// _kernel_asm is dereferenced without a null check.
341 void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row)
342 {
343  _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row);
344 }
345 
// NOTE(review): the signature (listing line 346) is missing from this extract; presumably
// CpuDepthwiseConv2dAssemblyWrapperKernel::get_storage_size() - confirm.
// Returns the assembly kernel's get_storage_size(); _kernel_asm is dereferenced without a null check.
347 {
348  return _kernel_asm->get_storage_size();
349 }
350 
// Forwards both arguments to the assembly kernel's get_working_size() and returns its result.
// _kernel_asm is dereferenced without a null check.
351 size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads, unsigned int num_input_channels) const
352 {
353  return _kernel_asm->get_working_size(num_threads, num_input_channels);
354 }
355 
// NOTE(review): the signature (listing line 356) is missing from this extract, so the
// function name cannot be confirmed from here.
// Reports whether an assembly kernel has been created (i.e. _kernel_asm is non-null).
357 {
358  return _kernel_asm != nullptr;
359 }
360 
// NOTE(review): the signature (listing line 361) is missing from this extract, so the
// function name cannot be confirmed from here.
// Returns a C-string view of _name; the pointer refers to storage owned by _name.
362 {
363  return _name.c_str();
364 }
365 
// Neither argument is used by this function.
// NOTE(review): listing line 371 (the return statement) is missing from this extract;
// presumably it returns ICPPKernel::default_mws - confirm against the original file.
366 size_t CpuDepthwiseConv2dAssemblyWrapperKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
367 {
368  ARM_COMPUTE_UNUSED(thread_count);
369  ARM_COMPUTE_UNUSED(platform);
370 
372 }
373 } // namespace kernels
374 } // namespace cpu
375 } // namespace arm_compute
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
Definition: Utils.h:1030
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
Shape of a tensor.
Definition: TensorShape.h:39
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
Calculate the depthwise convolution output shape of a tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:115
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
Indicates whether or not this function can be used to process the given parameters.
bool empty() const
Checks if pack is empty.
Definition: ITensorPack.cpp:80
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Store the tensor's metadata.
Definition: ITensorInfo.h:40
size_t get_working_size(unsigned int num_threads, unsigned int num_input_channels) const
Get size of the workspace needed by the assembly kernel.
Status class.
Definition: Error.h:52
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
Interface for CPU tensor.
Definition: ITensor.h:36
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Definition: Validate.h:284
SimpleTensor< float > src
Definition: DFT.cpp:155
Copyright (c) 2017-2022 Arm Limited.
1 channel, 1 F16 per channel
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Definition: Validate.h:159
size_t get_storage_size() const
Get the amount of storage space required for the rearranged weights and bias.
1 channel, 1 S32 per channel
void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row)
Pack bias and weights in a storage space for the assembly kernel.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
Definition: ITensorPack.cpp:54
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
Definition: Utils.h:1107
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
std::pair< int32_t, int32_t > get_quantized_activation_min_max(ActivationLayerInfo act_info, DataType data_type, UniformQuantizationInfo oq_info)
Returns a pair of minimum and maximum values for a quantized activation.
Definition: Utils.cpp:558
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
virtual uint8_t * buffer() const =0
Interface to be implemented by the child class to return a pointer to CPU memory. ...
Size2D dilation
Dilation, in elements, across x and y.
Definition: Types.h:2274
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
const std::vector< float > & scale() const
Scale vector accessor.
virtual PaddingSize padding() const =0
Padding of tensor.
unsigned int left
left of the border
Definition: Types.h:393
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:915
quantized, symmetric per channel fixed-point 8-bit number
virtual size_t offset_first_element_in_bytes() const =0
The offset from the beginning of the memory allocation to the first element of the tensor...
void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr)
Compute quantized per-channel multipliers and shifts.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
Definition: ITensorPack.cpp:64
size_t get_mws(const CPUInfo &platform, size_t thread_count) const override
Return minimum workload size of the relevant kernel.
#define ARM_COMPUTE_RETURN_ERROR_MSG(...)
An error is returned with the given description.
Definition: Error.h:194
Information about executing thread and CPU.
Definition: CPPTypes.h:179
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
Class for specifying the size of an image or rectangle.
Definition: Size2D.h:34
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:541
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:788
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:244
Tensor packing service.
Definition: ITensorPack.h:39
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157
void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info)
Initialise the kernel's src and dst.
quantized, asymmetric fixed-point 8-bit number signed
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
Performs a mapping between Compute Library ActivationLayerInfo and the assembly Activation structure...
arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info)
Performs a mapping between Compute Library PadStrideInfo and the assembly PaddingValues structure...
Describe a multidimensional execution window.
Definition: Window.h:39
static constexpr size_t default_mws
Definition: ICPPKernel.h:41
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.
const int32_t * bias