// -- validate_arguments_3x3 (fragment) -------------------------------------
// NOTE(review): non-contiguous doxygen extract. The leading integers embedded
// in these lines are the ORIGINAL file's line numbers; lines between them are
// missing, and single statements are wrapped across several lines. This span
// will not compile as-is — it documents the visible logic only.
// Purpose (from visible code): metadata-only validation for the 3x3 depthwise
// convolution path, building permuted NCHW TensorInfo clones when a
// NHWC + depth_multiplier>1 case requires a layout permute.
44 Status validate_arguments_3x3(
const ITensorInfo *
input,
const ITensorInfo *weights,
const ITensorInfo *biases,
const ITensorInfo *output,
const PadStrideInfo &
conv_info,
45 unsigned int depth_multiplier, ActivationLayerInfo act_info,
const Size2D &dilation)
// A permute is only needed when the layout is NHWC and depth_multiplier > 1
// (is_nhwc is computed on a missing line — presumably from input's data layout).
54 const bool needs_permute = is_nhwc && (depth_multiplier > 1);
// Placeholder S32 tensor info for quantized per-channel output multipliers and
// shifts; starts as a 1-element shape and is resized to the channel count below.
58 TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1,
DataType::S32));
// idx_c (channel-dimension index) is computed on a missing line — TODO confirm.
66 output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c)));
76 TensorShape permuted_input_shape =
input->tensor_shape();
77 TensorShape permuted_weights_shape = weights->tensor_shape();
// Activation deliberately reset here: the ConvolutionInfo used for shape
// computation carries a default ActivationLayerInfo, not act_info.
78 const ConvolutionInfo
info{
conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
// Build resizable, padding-free NCHW clones for validating the permuted path.
85 const TensorInfo permuted_input =
input->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(
DataLayout::NCHW);
86 const TensorInfo permuted_weights = weights->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(
DataLayout::NCHW);
// permuted_output_shape is computed on a missing line — presumably via
// compute_depthwise_convolution_shape; verify against the full source.
87 const TensorInfo permuted_output = output->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(
DataLayout::NCHW);
// Tail arguments of two kernel-validate calls (call heads are on missing
// lines). The same S32 info is passed for both multipliers and shifts —
// acceptable for validation since only metadata is inspected.
91 dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
101 dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
// -- CLDepthwiseConvolutionLayerGeneric constructor (fragment) --------------
// NOTE(review): initializer list is incomplete (original lines 113-115 and
// 117-120 are missing from this extract). Visible members are default- or
// false-initialized; the memory manager is moved into the memory group.
107 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
108 : _memory_group(std::move(memory_manager)),
// The generic path owns the native depthwise kernel directly.
109 _dwc_native_kernel(std::make_unique<CLDepthwiseConvolutionLayerNativeKernel>()),
110 _permute_input_to_nhwc(),
111 _permute_weights_to_nhwc(),
112 _permute_output_to_nchw(),
116 _output_multipliers(),
121 _needs_permute(false),
// -- Generic::configure, 9-argument overload (fragment) ---------------------
// NOTE(review): only the signature is visible; the body (which presumably
// forwards to the CLCompileContext overload below — TODO confirm) is missing.
129 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *
input,
const ICLTensor *weights,
const ICLTensor *biases, ICLTensor *output,
const PadStrideInfo &
conv_info,
130 unsigned int depth_multiplier,
const ActivationLayerInfo &act_info,
const Size2D &dilation)
// -- Generic::configure with CLCompileContext (fragment) --------------------
// NOTE(review): non-contiguous extract; conditional headers (e.g. the
// if(_needs_permute) / if quantized-per-channel guards) sit on missing lines,
// so the nesting around the visible statements cannot be fully reconstructed.
135 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
const CLCompileContext &compile_context, ICLTensor *
input,
const ICLTensor *weights,
const ICLTensor *biases,
136 ICLTensor *output,
const PadStrideInfo &
conv_info,
137 unsigned int depth_multiplier,
const ActivationLayerInfo &act_info,
const Size2D &dilation)
// Tail of a validate/error-check call (head missing): biases are optional,
// so its ITensorInfo is only dereferenced when non-null.
142 biases !=
nullptr ? biases->info() :
nullptr,
// Reset prepare state and remember the caller's weights for prepare().
150 _is_prepared =
false;
151 _original_weights = weights;
// By default operate directly on the caller's tensors; the *_to_use pointers
// are redirected to permuted scratch tensors below when a permute is needed.
156 ICLTensor *input_to_use =
input;
157 const ICLTensor *weights_to_use = weights;
158 ICLTensor *output_to_use = output;
// Scratch tensors are lifetime-managed by the memory group.
161 _memory_group.manage(&_permuted_input);
162 _memory_group.manage(&_permuted_output);
// NCHW -> NHWC permutation vector is (2,0,1).
165 _permute_input_to_nhwc.configure(compile_context,
input, &_permuted_input,
PermutationVector(2U, 0U, 1U));
169 _permute_weights_to_nhwc.configure(compile_context, weights, &_permuted_weights,
PermutationVector(2U, 0U, 1U));
// Propagate the output quantization info onto the permuted scratch output.
173 _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
175 input_to_use = &_permuted_input;
176 weights_to_use = &_permuted_weights;
177 output_to_use = &_permuted_output;
// Per-channel quantization outputs: null unless the (missing) quantized
// branch initializes and selects the S32 multiplier/shift tensors below.
180 CLTensor *output_multipliers_to_use =
nullptr;
181 CLTensor *output_shifts_to_use =
nullptr;
// num_filters is computed on a missing line — presumably the channel count
// for per-channel quantization, 1 otherwise; verify against the full source.
187 _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1,
DataType::S32));
188 _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1,
DataType::S32));
190 output_multipliers_to_use = &_output_multipliers;
191 output_shifts_to_use = &_output_shifts;
// n0 = 8 lanes only for depth_multiplier == 1; otherwise process one at a time.
194 DWCWeightsKernelInfo dwc_weights_info;
195 dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
196 DWCKernelInfo dwc_info;
197 dwc_info.activation_info = act_info;
// Configure the native depthwise kernel on the (possibly permuted) tensors.
198 _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
199 dwc_weights_info, dwc_info,
conv_info, depth_multiplier, dilation,
200 output_multipliers_to_use, output_shifts_to_use);
204 _permuted_input.allocator()->allocate();
// NHWC -> NCHW back-permute of the result uses vector (1,2,0).
208 _permute_output_to_nchw.configure(compile_context, &_permuted_output, output,
PermutationVector(1U, 2U, 0U));
209 _permuted_output.allocator()->allocate();
214 _output_multipliers.allocator()->allocate();
215 _output_shifts.allocator()->allocate();
// -- Generic::validate (fragment) -------------------------------------------
// NOTE(review): the function head (original line ~220) is missing; only the
// parameter tail and parts of the body survive in this extract. Mirrors the
// configure() logic as a metadata-only check.
221 unsigned int depth_multiplier,
const ActivationLayerInfo &act_info,
const Size2D &dilation)
// Same kernel-info choices as configure(): n0 = 8 only when dm == 1.
230 DWCWeightsKernelInfo dwc_weights_info;
231 dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
232 DWCKernelInfo dwc_info;
233 dwc_info.activation_info = act_info;
// Placeholder S32 multipliers/shifts info, resized to the channel count.
239 TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1,
DataType::S32));
// idx_c is computed on a missing line — TODO confirm (channel dim index).
247 output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c)));
257 TensorShape permuted_input_shape =
input->tensor_shape();
258 TensorShape permuted_weights_shape = weights->tensor_shape();
// Activation deliberately defaulted for the shape-computation ConvolutionInfo.
259 const ConvolutionInfo
info{
conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
// Generic path validates the permuted tensors in NHWC (contrast with the
// 3x3 validate above, which clones to NCHW).
266 const TensorInfo permuted_input =
input->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(
DataLayout::NHWC);
267 const TensorInfo permuted_weights = weights->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(
DataLayout::NHWC);
// permuted_output_shape comes from a missing line — verify in full source.
268 const TensorInfo permuted_output = output->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(
DataLayout::NHWC);
// Tails of two native-kernel validate calls (heads on missing lines).
273 dwc_info,
conv_info, depth_multiplier, dilation,
274 &output_multipliers_shifts_info, &output_multipliers_shifts_info));
280 dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
// -- Generic::run (fragment) ------------------------------------------------
// NOTE(review): function head and the kernel-enqueue line are missing.
// Scratch tensors are acquired for the duration of run() via RAII scope.
289 MemoryGroupResourceScope scope_mg(_memory_group);
// Permutes run conditionally (guards on missing lines — presumably
// _needs_permute; TODO confirm).
293 _permute_input_to_nhwc.run();
298 _permute_output_to_nchw.run();
// -- Generic::prepare (fragment) --------------------------------------------
// NOTE(review): guards (quantized check, _is_prepared check) sit on missing
// lines. Visible work: compute per-channel quantized multipliers/shifts on
// the mapped CL tensors, then permute weights once and drop the originals.
302 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
// Map so the host can write the computed S32 values directly.
308 _output_multipliers.map();
309 _output_shifts.map();
// Output-feature-maps dimension index: 2 when weights were permuted, else 0.
310 const unsigned int idx_ofms = _needs_permute ? 2 : 0;
// Argument lines of compute_quantized_multipliers_and_shifts(...) — the call
// head is on a missing line.
312 _original_weights->info(),
315 reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
316 reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
317 _output_multipliers.unmap();
318 _output_shifts.unmap();
// One-time weights permute; original weights can then be freed by the graph.
325 _permuted_weights.allocator()->allocate();
326 _permute_weights_to_nhwc.run();
327 _original_weights->mark_as_unused();
// -- CLDepthwiseConvolutionLayerInternal3x3 constructor (fragment) ----------
// NOTE(review): initializer list has gaps (original lines 341-343, 345,
// 347-348, 350 missing). Kernels start null and are created in configure()
// depending on data layout; the border handler is owned unconditionally.
333 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr<IMemoryManager> memory_manager)
334 : _memory_group(std::move(memory_manager)),
335 _kernel_nchw(nullptr),
336 _kernel_nhwc(nullptr),
337 _border_handler(std::make_unique<CLFillBorderKernel>()),
338 _permute_input_to_nchw(),
339 _permute_weights_to_nchw(),
340 _permute_output_to_nhwc(),
344 _output_multipliers(),
346 _original_weights(nullptr),
349 _needs_permute(false),
351 _is_quantized(false),
// -- Internal3x3::configure, 8-argument overload (fragment) -----------------
// NOTE(review): only the signature is visible; body missing (presumably
// forwards to the CLCompileContext overload — TODO confirm).
356 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *
input,
const ICLTensor *weights,
const ICLTensor *biases, ICLTensor *output,
357 const PadStrideInfo &
conv_info,
unsigned int depth_multiplier, ActivationLayerInfo act_info,
const Size2D &dilation)
// -- Internal3x3::configure with CLCompileContext (fragment) ----------------
// NOTE(review): non-contiguous extract; several guard/branch headers are
// missing, so the exact nesting of the visible statements is not recoverable
// here. Note the `output` parameter does not appear in the visible signature
// lines (original line 363 is missing) yet is used in the body.
362 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(
const CLCompileContext &compile_context, ICLTensor *
input,
const ICLTensor *weights,
const ICLTensor *biases,
364 const PadStrideInfo &
conv_info,
unsigned int depth_multiplier, ActivationLayerInfo act_info,
const Size2D &dilation)
// Tail of a validate/error-check call: biases are optional.
370 biases !=
nullptr ? biases->info() :
nullptr,
// 3x3 path only permutes (NHWC -> NCHW) when depth_multiplier > 1.
379 _needs_permute = _is_nhwc && (depth_multiplier > 1);
381 _is_prepared =
false;
382 _original_weights = weights;
// Default to the caller's tensors; redirected to scratch tensors on permute.
386 ICLTensor *input_to_use =
input;
387 const ICLTensor *weights_to_use = weights;
388 ICLTensor *output_to_use = output;
394 _memory_group.manage(&_permuted_input);
395 _memory_group.manage(&_permuted_output);
// NHWC -> NCHW permutation vector is (1,2,0) (inverse of the generic path).
398 _permute_input_to_nchw.configure(compile_context,
input, &_permuted_input,
PermutationVector(1U, 2U, 0U));
402 _permute_weights_to_nchw.configure(compile_context, weights, &_permuted_weights,
PermutationVector(1U, 2U, 0U));
404 _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
406 input_to_use = &_permuted_input;
407 weights_to_use = &_permuted_weights;
408 output_to_use = &_permuted_output;
// Kernel selection: three creation sites survive, each under a different
// (missing) branch — NCHW kernel when permuting, NHWC kernel for native
// NHWC, NCHW kernel again for native NCHW; TODO confirm branch conditions.
410 _kernel_nchw = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
414 _kernel_nhwc = std::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
418 _kernel_nchw = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
// Per-channel quantization outputs: null unless the quantized branch selects
// the S32 multiplier/shift tensors below.
421 CLTensor *output_multipliers_to_use =
nullptr;
422 CLTensor *output_shifts_to_use =
nullptr;
// One multiplier/shift per filter for per-channel quantization, else one.
426 const size_t num_filters = (is_quantized_per_channel) ? weights->info()->dimension(idx_c) : 1;
428 _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1,
DataType::S32));
429 _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1,
DataType::S32));
431 output_multipliers_to_use = &_output_multipliers;
432 output_shifts_to_use = &_output_shifts;
// Native-NHWC case runs the NHWC kernel; otherwise the NCHW kernel runs on
// the (possibly permuted) tensors.
436 if(_is_nhwc && !_needs_permute)
438 _kernel_nhwc->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
conv_info, depth_multiplier,
443 _kernel_nchw->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
conv_info, depth_multiplier,
444 act_info, dilation, output_multipliers_to_use, output_shifts_to_use);
449 _output_multipliers.allocator()->allocate();
450 _output_shifts.allocator()->allocate();
// Back-permute NCHW result to NHWC with vector (2,0,1).
458 _permute_output_to_nhwc.configure(compile_context, &_permuted_output, output,
PermutationVector(2U, 0U, 1U));
461 _permuted_input.allocator()->allocate();
462 _permuted_output.allocator()->allocate();
// Border fill value: 0.f by default; for asymmetric-quantized inputs the
// zero-point offset is used instead (guard on a missing line).
465 PixelValue &&zero_value(0.f);
468 zero_value = PixelValue(static_cast<uint8_t>(
input->info()->quantization_info().uniform().offset));
// The NCHW kernel needs an explicit constant border.
470 if(!_is_nhwc || _needs_permute)
472 _border_handler->configure(compile_context, input_to_use, _kernel_nchw->border_size(),
BorderMode::CONSTANT, zero_value);
// -- Internal3x3::validate (fragment) ---------------------------------------
// NOTE(review): the function head is missing; visible body is a pure
// delegation to the file-local validate_arguments_3x3 helper.
477 const PadStrideInfo &
conv_info,
unsigned int depth_multiplier, ActivationLayerInfo act_info,
const Size2D &dilation)
479 return validate_arguments_3x3(
input, weights, biases, output,
conv_info, depth_multiplier, act_info, dilation);
// -- Internal3x3::run (fragment) --------------------------------------------
// NOTE(review): function head, kernel enqueues, and branch bodies are on
// missing lines; only the scratch-memory scope, the conditional input
// permute, the layout branch header, and the output back-permute survive.
486 MemoryGroupResourceScope scope_mg(_memory_group);
490 _permute_input_to_nchw.run();
493 if(_is_nhwc && !_needs_permute)
504 _permute_output_to_nhwc.run();
// -- Internal3x3::prepare (fragment) ----------------------------------------
// NOTE(review): guards (quantized / _is_prepared checks) are on missing
// lines. Mirrors Generic::prepare: compute quantized multipliers/shifts on
// mapped tensors, then one-time weights permute to NCHW.
508 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepare()
514 _output_multipliers.map();
515 _output_shifts.map();
// OFM dimension index: 0 for NHWC weights, 2 for NCHW (contrast with the
// generic path, which keys this off _needs_permute).
516 const unsigned int idx_ofms = _is_nhwc ? 0 : 2;
// Argument lines of compute_quantized_multipliers_and_shifts(...) — call
// head on a missing line.
518 _original_weights->info(),
521 reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
522 reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
523 _output_multipliers.unmap();
524 _output_shifts.unmap();
// One-time weights permute; originals released afterwards.
531 _permuted_weights.allocator()->allocate();
532 _permute_weights_to_nchw.run();
533 _original_weights->mark_as_unused();
// -- CLDepthwiseConvolutionLayer::configure + dispatch (fragments) ----------
// NOTE(review): heavily gapped. Visible logic: pick OPTIMIZED (3x3) vs
// GENERIC via get_depthwiseconvolution_function, then configure the chosen
// internal function. Lines 275-278 are stray switch headers / a prepare()
// call from validate()/run()/prepare() dispatchers whose bodies are missing.
553 unsigned int depth_multiplier,
// Function selection from tensor metadata; biases remain optional.
556 _depth_conv_func = get_depthwiseconvolution_function(
input->info(), weights->
info(), (biases !=
nullptr) ? biases->
info() :
nullptr, output->
info(),
conv_info, depth_multiplier, act_info,
558 switch(_depth_conv_func)
// OPTIMIZED case: route to the internal 3x3 implementation.
561 _func_3x3.set_memory_group(_memory_manager);
562 _func_3x3.configure(compile_context,
input, weights, biases, output,
conv_info, depth_multiplier, act_info, dilation);
// GENERIC case: route to the native-kernel implementation.
566 _func_generic.set_memory_group(_memory_manager);
567 _func_generic.configure(compile_context,
input, weights, biases, output,
conv_info, depth_multiplier, act_info, dilation);
// Remaining dispatch switches (validate/run/prepare) — bodies missing.
579 switch(depth_conv_func)
606 switch(_depth_conv_func)
621 switch(_depth_conv_func)
627 _func_generic.prepare();
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
DepthwiseConvolutionFunction
Available DepthwiseConvolutionFunction.
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
Calculate the depthwise convolution output shape of a tensor.
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, ActivationLayerInfo act_info=ActivationLayerInfo(), const Size2D &dilation=Size2D(1U, 1U))
Static function to check if the given info will lead to a valid configuration of CLDepthwiseConvolutionLayer.
void prepare() override
Prepare the function for executing.
void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, unsigned int idx_ofms, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr)
Compute quantized per-channel multipliers and shifts.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(...)
static CLScheduler & get()
Access the scheduler singleton.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
1 channel, 1 F32 per channel
Strides PermutationVector
Permutation vector.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Activation Layer Information class.
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
ITensorInfo * info() const override
Interface to be implemented by the child class to return the tensor's metadata.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
void permute(Dimensions< T > &dimensions, const PermutationVector &perm)
Permutes given Dimensions according to a permutation vector.
1 channel, 1 S32 per channel
Optimized Depthwise Convolution.
Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
quantized, asymmetric fixed-point 8-bit number unsigned
void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, ActivationLayerInfo act_info=ActivationLayerInfo(), const Size2D &dilation=Size2D(1U, 1U))
Initialize the function's source, destination, weights and convolution information.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
Padding and stride information class.
CLDepthwiseConvolutionLayer(std::shared_ptr< IMemoryManager > memory_manager=nullptr)
Default constructor.
void enqueue(ICLKernel &kernel, bool flush=true)
Schedule the execution of the passed kernel if possible.
Num samples, channels, height, width.
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
quantized, symmetric per channel fixed-point 8-bit number
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
~CLDepthwiseConvolutionLayer()
Default destructor.
Interface for OpenCL tensor.
Class for specifying the size of an image or rectangle.
void run() override
Run the kernels contained in the function.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
Static function to check if given info will lead to a valid configuration of CLPermute.
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, ActivationLayerInfo act_info=ActivationLayerInfo(), const Size2D &dilation=Size2D(1U, 1U))
Static function to check if the given info will lead to a valid configuration of CLDepthwiseConvolutionLayer.
Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes, const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info)
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U), const ITensorInfo *output_multipliers=nullptr, const ITensorInfo *output_shifts=nullptr)
Static function to check if the given info will lead to a valid configuration of this CLDepthwiseConvolutionLayer kernel.
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, ActivationLayerInfo act_info=ActivationLayerInfo(), const Size2D &dilation=Size2D(1U, 1U), const ITensorInfo *output_multipliers=nullptr, const ITensorInfo *output_shifts=nullptr)
Static function to check if the given info will lead to a valid configuration of this CLDepthwiseConvolutionLayer kernel.