35 #include "src/core/NEON/kernels/assembly/depthwise.hpp" 37 #include "depthwise_common.hpp" 53 constexpr
unsigned int idx_channels = 0;
54 constexpr
unsigned int idx_batches = 3;
56 template <
typename TSrc,
typename TWeights,
typename TDst>
57 void create_arm_dwc(
const ITensorInfo *
src,
const ITensorInfo *weights, ITensorInfo *
dst,
58 const ConvolutionInfo &
info,
const CPUInfo &cpu_info,
59 std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel)
61 unsigned int stride_cols{};
62 unsigned int stride_rows{};
63 std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
67 const unsigned int n_batches = src->dimension(idx_batches);
68 const unsigned int src_rows = src->dimension(idx_height);
69 const unsigned int src_cols = src->dimension(idx_width);
70 const unsigned int n_channels = src->dimension(idx_channels);
71 const unsigned int dst_rows = dst->dimension(idx_height);
72 const unsigned int dst_cols = dst->dimension(idx_width);
74 const unsigned int kernel_cols = weights->dimension(idx_width);
75 const unsigned int kernel_rows = weights->dimension(idx_height);
79 arm_conv::depthwise::DepthwiseArgs
args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
80 n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
81 padding, activation,
nullptr);
84 auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(
args);
85 if(dwc_kernel_asm ==
nullptr)
91 kernel = std::move(dwc_kernel_asm);
94 template <
typename TSrc,
typename TWeights,
typename TDst>
95 void create_arm_dwc_quant(
const ITensorInfo *src,
const ITensorInfo *weights, ITensorInfo *dst,
96 const ConvolutionInfo &info,
const CPUInfo &cpu_info,
97 std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
98 std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts)
100 unsigned int stride_cols{};
101 unsigned int stride_rows{};
102 std::tie(stride_cols, stride_rows) = info.pad_stride_info.stride();
106 const unsigned int n_batches = src->dimension(idx_batches);
107 const unsigned int src_rows = src->dimension(idx_height);
108 const unsigned int src_cols = src->dimension(idx_width);
109 const unsigned int n_channels = src->dimension(idx_channels);
110 const unsigned int dst_rows = dst->dimension(idx_height);
111 const unsigned int dst_cols = dst->dimension(idx_width);
113 const unsigned int kernel_cols = weights->dimension(idx_width);
114 const unsigned int kernel_rows = weights->dimension(idx_height);
118 arm_conv::depthwise::DepthwiseArgs
args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
119 n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
120 padding, activation,
nullptr);
122 const auto src_qinfo = src->quantization_info().uniform();
123 const auto weights_qinfo = weights->quantization_info();
124 const auto dst_qinfo = dst->quantization_info().uniform();
126 const unsigned int num_filters = weights_qinfo.scale().size();
128 multipliers.resize(num_filters);
129 std::vector<int32_t> dst_shifts(num_filters);
138 int32_t max_activation = std::numeric_limits<TSrc>::max();
139 if(info.act_info.enabled())
148 left_shifts.resize(num_filters);
149 right_shifts.resize(num_filters);
150 bool need_left_shift =
false;
151 for(
unsigned int i = 0; i < num_filters; ++i)
153 left_shifts[i] = std::max(-dst_shifts[i], static_cast<int32_t>(0));
154 right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0));
155 if(dst_shifts[i] < 0 && !need_left_shift)
157 need_left_shift =
true;
164 weights_qinfo.uniform().offset,
166 (need_left_shift) ? left_shifts.data() :
nullptr,
169 static_cast<TSrc
>(min_activation),
170 static_cast<TSrc>(max_activation));
177 weights_qinfo.uniform().offset,
181 static_cast<TSrc
>(min_activation),
182 static_cast<TSrc>(max_activation));
186 auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(
args, requant_args);
187 if(dwc_kernel_asm ==
nullptr)
193 kernel = std::move(dwc_kernel_asm);
198 : _kernel_asm(nullptr),
217 #if defined(__aarch64__) 223 create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(
src, weights,
dst,
info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
227 create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(
src, weights,
dst,
info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
231 create_arm_dwc_quant<int8_t, int8_t, int8_t>(
src, weights,
dst,
info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts);
233 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) 235 create_arm_dwc<float16_t, float16_t, float16_t>(
src, weights,
dst,
info, cpu_info, _kernel_asm);
237 #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) 239 create_arm_dwc<float, float, float>(
src, weights,
dst,
info, cpu_info, _kernel_asm);
244 #endif // defined(__aarch64__) 247 ICpuKernel::configure(win);
254 #if !defined(__aarch64__) 256 #endif // !defined(__aarch64__) 310 const auto src_ptr = src->
buffer() + src->info()->offset_first_element_in_bytes();
315 const auto src_shape = src->info()->tensor_shape();
317 const auto src_padding = src->info()->padding();
320 const size_t ld_src_col = src_shape[0] + src_padding.
left + src_padding.right;
321 const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
322 const size_t ld_src_batch = ld_src_row * src_shape[2];
323 const size_t ld_dst_col =
dst_shape[0] + dst_padding.left + dst_padding.right;
324 const size_t ld_dst_row = ld_dst_col * (
dst_shape[1] + dst_padding.top + dst_padding.bottom);
325 const size_t ld_dst_batch = ld_dst_row *
dst_shape[2];
327 _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch,
329 dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
335 _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row);
340 return _kernel_asm->get_storage_size();
345 return _kernel_asm->get_working_size(num_threads, num_input_channels);
350 return _kernel_asm !=
nullptr;
355 return "CpuDepthwiseConv2dAssemblyWrapperKernel";
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
Calculate the depthwise convolution output shape of a tensor.
bool is_configured() const
Was the asm kernel successfully configured?
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
const char * name() const override
Name of the kernel.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
Indicates whether or not this function can be used to process the given parameters.
bool empty() const
Checks if pack is empty.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Store the tensor's metadata.
size_t get_working_size(unsigned int num_threads, unsigned int num_input_channels) const
Get size of the workspace needed by the assembly kernel.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Interface for CPU tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
SimpleTensor< float > src
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
size_t get_storage_size() const
Get the amount of storage space required for the rearranged weights and bias.
1 channel, 1 S32 per channel
void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row)
Pack bias and weights in a storage space for the assembly kernel.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
std::pair< int32_t, int32_t > get_quantized_activation_min_max(ActivationLayerInfo act_info, DataType data_type, UniformQuantizationInfo oq_info)
Returns a pair of minimum and maximum values for a quantized activation.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
virtual uint8_t * buffer() const =0
Interface to be implemented by the child class to return a pointer to CPU memory.
Size2D dilation
Dilation, in elements, across x and y.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
const std::vector< float > & scale() const
Scale vector accessor.
virtual PaddingSize padding() const =0
Padding of tensor.
unsigned int left
left of the border
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
quantized, symmetric per channel fixed-point 8-bit number
virtual size_t offset_first_element_in_bytes() const =0
The offset from the beginning of the memory allocation to the first element of the tensor.
void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr)
Compute quantized per-channel multipliers and shifts.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
CpuDepthwiseConv2dAssemblyWrapperKernel()
Default constructor.
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
#define ARM_COMPUTE_RETURN_ERROR_MSG(...)
An error is returned with the given description.
Information about executing thread and CPU.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
~CpuDepthwiseConv2dAssemblyWrapperKernel()
Class for specifying the size of an image or rectangle.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info)
Initialise the kernel's src and dst.
quantized, asymmetric fixed-point 8-bit number signed
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
Performs a mapping between the Compute Library ActivationLayerInfo and the assembly Activation structure.
arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info)
Performs a mapping between the Compute Library PadStrideInfo and the assembly PaddingValues structure.
Describe a multidimensional execution window.
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.