48 Status validate_arguments_3x3(
const ITensorInfo *
input,
const ITensorInfo *weights,
const ITensorInfo *biases,
const ITensorInfo *output,
const PadStrideInfo &
conv_info,
49 unsigned int depth_multiplier, ActivationLayerInfo act_info,
GPUTarget gpu_target,
const Size2D &dilation)
58 const bool needs_permute = is_nhwc && (depth_multiplier > 1);
59 const bool needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && is_quantized;
60 const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
61 const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
63 DepthwiseConvolutionReshapeInfo
info;
65 info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
67 TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1,
DataType::S32));
75 output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c)));
85 TensorShape permuted_input_shape = input->tensor_shape();
86 TensorShape permuted_weights_shape = weights->tensor_shape();
93 const TensorInfo permuted_input = input->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(
DataLayout::NCHW);
94 const TensorInfo permuted_weights = weights->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(
DataLayout::NCHW);
95 const TensorInfo permuted_output = output->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(
DataLayout::NCHW);
98 conv_info, depth_multiplier, act_info, gpu_target,
99 dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
103 if(needs_weights_reshape)
107 output,
conv_info, depth_multiplier, act_info,
108 dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
113 dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
119 dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
125 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::CLDepthwiseConvolutionLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
126 : _memory_group(
std::move(memory_manager)),
127 _dwc_native_kernel(
std::make_unique<CLDepthwiseConvolutionLayerNativeKernel>()),
128 _permute_input_to_nhwc(),
129 _permute_weights_to_nhwc(),
130 _permute_output_to_nchw(),
134 _output_multipliers(),
139 _needs_permute(false),
147 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(ICLTensor *input,
const ICLTensor *weights,
const ICLTensor *biases, ICLTensor *output,
const PadStrideInfo &conv_info,
148 unsigned int depth_multiplier,
const ActivationLayerInfo &act_info,
const Size2D &dilation)
153 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::configure(
const CLCompileContext &compile_context, ICLTensor *input,
const ICLTensor *weights,
const ICLTensor *biases,
154 ICLTensor *output,
const PadStrideInfo &conv_info,
155 unsigned int depth_multiplier,
const ActivationLayerInfo &act_info,
const Size2D &dilation)
160 biases !=
nullptr ? biases->info() :
nullptr,
168 _is_prepared =
false;
169 _original_weights = weights;
174 ICLTensor *input_to_use =
input;
175 const ICLTensor *weights_to_use = weights;
176 ICLTensor *output_to_use = output;
179 _memory_group.manage(&_permuted_input);
180 _memory_group.manage(&_permuted_output);
183 _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input,
PermutationVector(2
U, 0
U, 1
U));
187 _permute_weights_to_nhwc.configure(compile_context, weights, &_permuted_weights,
PermutationVector(2
U, 0
U, 1
U));
191 _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
193 input_to_use = &_permuted_input;
194 weights_to_use = &_permuted_weights;
195 output_to_use = &_permuted_output;
198 CLTensor *output_multipliers_to_use =
nullptr;
199 CLTensor *output_shifts_to_use =
nullptr;
205 _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1,
DataType::S32));
206 _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1,
DataType::S32));
208 output_multipliers_to_use = &_output_multipliers;
209 output_shifts_to_use = &_output_shifts;
212 DWCWeightsKernelInfo dwc_weights_info;
213 dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
214 DWCKernelInfo dwc_info;
215 dwc_info.activation_info = act_info;
216 _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
217 dwc_weights_info, dwc_info, conv_info, depth_multiplier, dilation,
218 output_multipliers_to_use, output_shifts_to_use);
222 _permuted_input.allocator()->allocate();
226 _permute_output_to_nchw.configure(compile_context, &_permuted_output, output,
PermutationVector(1
U, 2
U, 0
U));
227 _permuted_output.allocator()->allocate();
232 _output_multipliers.allocator()->allocate();
233 _output_shifts.allocator()->allocate();
238 const PadStrideInfo &conv_info,
239 unsigned int depth_multiplier,
const ActivationLayerInfo &act_info,
const Size2D &dilation)
245 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
246 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
248 DWCWeightsKernelInfo dwc_weights_info;
249 dwc_weights_info.n0 = (depth_multiplier == 1) ? 8 : 1;
250 DWCKernelInfo dwc_info;
251 dwc_info.activation_info = act_info;
257 TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1
U), 1,
DataType::S32));
265 output_multipliers_shifts_info.set_tensor_shape(TensorShape(weights->dimension(idx_c)));
275 TensorShape permuted_input_shape = input->tensor_shape();
276 TensorShape permuted_weights_shape = weights->tensor_shape();
283 const TensorInfo permuted_input = input->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(
DataLayout::NHWC);
284 const TensorInfo permuted_weights = weights->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(
DataLayout::NHWC);
285 const TensorInfo permuted_output = output->clone()->set_is_resizable(
true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(
DataLayout::NHWC);
290 dwc_info, conv_info, depth_multiplier, dilation,
291 &output_multipliers_shifts_info, &output_multipliers_shifts_info));
297 dilation, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
306 MemoryGroupResourceScope scope_mg(_memory_group);
310 _permute_input_to_nhwc.run();
315 _permute_output_to_nchw.run();
319 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerGeneric::prepare()
325 _output_multipliers.map();
326 _output_shifts.map();
329 _original_weights->info(),
332 reinterpret_cast<int32_t *
>(_output_multipliers.ptr_to_element(Coordinates(0))),
333 reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
334 _output_multipliers.unmap();
335 _output_shifts.unmap();
342 _permuted_weights.allocator()->allocate();
343 _permute_weights_to_nhwc.run();
344 _original_weights->mark_as_unused();
350 CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr<IMemoryManager> memory_manager)
351 : _memory_group(
std::move(memory_manager)),
353 _border_handler(
std::make_unique<CLFillBorderKernel>()),
354 _permute_input_to_nchw(),
355 _permute_weights_to_nchw(),
356 _permute_output_to_nhwc(),
357 _reshape_weights(
std::make_unique<CLDepthwiseConvolutionLayerReshapeWeightsKernel>()),
361 _output_multipliers(),
363 _original_weights(nullptr),
366 _needs_permute(false),
367 _needs_weights_reshape(false),
373 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(ICLTensor *input,
const ICLTensor *weights,
const ICLTensor *biases, ICLTensor *output,
374 const PadStrideInfo &conv_info,
unsigned int depth_multiplier, ActivationLayerInfo act_info,
const Size2D &dilation)
379 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::configure(
const CLCompileContext &compile_context, ICLTensor *input,
const ICLTensor *weights,
const ICLTensor *biases,
381 const PadStrideInfo &conv_info,
unsigned int depth_multiplier, ActivationLayerInfo act_info,
const Size2D &dilation)
389 biases !=
nullptr ? biases->info() :
nullptr,
399 _needs_permute = is_nhwc && (depth_multiplier > 1);
400 _needs_weights_reshape = is_nhwc && (depth_multiplier == 1) && _is_quantized;
402 _is_prepared =
false;
403 _original_weights = weights;
407 ICLTensor *input_to_use =
input;
408 const ICLTensor *weights_to_use = weights;
409 ICLTensor *output_to_use = output;
412 const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
414 const bool is_stride_1_dilation_1 = (is_stride_1 && dilation.x() == 1 && dilation.y() == 1);
416 DepthwiseConvolutionReshapeInfo
info;
418 info.transpose = is_stride_1_dilation_1 && is_dot8_supported;
422 _memory_group.manage(&_permuted_input);
423 _memory_group.manage(&_permuted_output);
426 _permute_input_to_nchw.configure(compile_context, input, &_permuted_input,
PermutationVector(1
U, 2
U, 0
U));
430 _permute_weights_to_nchw.configure(compile_context, weights, &_permuted_weights,
PermutationVector(1
U, 2
U, 0
U));
432 _permuted_output.info()->set_quantization_info(output->info()->quantization_info());
434 input_to_use = &_permuted_input;
435 weights_to_use = &_permuted_weights;
436 output_to_use = &_permuted_output;
438 _kernel = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
442 if(_needs_weights_reshape)
444 _reshape_weights->configure(compile_context, weights, &_permuted_weights, info);
445 weights_to_use = &_permuted_weights;
447 _kernel = std::make_unique<CLDepthwiseConvolutionLayer3x3NHWCKernel>();
451 _kernel = std::make_unique<CLDepthwiseConvolutionLayer3x3NCHWKernel>();
454 CLTensor *output_multipliers_to_use =
nullptr;
455 CLTensor *output_shifts_to_use =
nullptr;
459 const size_t num_filters = (is_quantized_per_channel) ? weights->info()->dimension(idx_c) : 1;
461 _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1,
DataType::S32));
462 _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1,
DataType::S32));
464 output_multipliers_to_use = &_output_multipliers;
465 output_shifts_to_use = &_output_shifts;
469 _kernel->set_target(gpu_target);
470 _kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use, conv_info, depth_multiplier,
471 act_info, dilation, output_multipliers_to_use, output_shifts_to_use);
475 _output_multipliers.allocator()->allocate();
476 _output_shifts.allocator()->allocate();
484 _permute_output_to_nhwc.configure(compile_context, &_permuted_output, output,
PermutationVector(2
U, 0
U, 1
U));
487 _permuted_input.allocator()->allocate();
488 _permuted_output.allocator()->allocate();
491 PixelValue &&zero_value(0.f);
494 zero_value = PixelValue(static_cast<uint8_t>(input->info()->quantization_info().uniform().offset));
496 _border_handler->configure(compile_context, input_to_use, _kernel->border_size(),
BorderMode::CONSTANT, zero_value);
500 const PadStrideInfo &conv_info,
unsigned int depth_multiplier, ActivationLayerInfo act_info,
GPUTarget gpu_target,
const Size2D &dilation)
502 return validate_arguments_3x3(input, weights, biases, output, conv_info, depth_multiplier, act_info, gpu_target, dilation);
509 MemoryGroupResourceScope scope_mg(_memory_group);
513 _permute_input_to_nchw.run();
520 _permute_output_to_nhwc.run();
524 void CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayerInternal3x3::prepare()
530 _output_multipliers.map();
531 _output_shifts.map();
534 _original_weights->info(),
537 reinterpret_cast<int32_t *
>(_output_multipliers.ptr_to_element(Coordinates(0))),
538 reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
539 _output_multipliers.unmap();
540 _output_shifts.unmap();
547 _permuted_weights.allocator()->allocate();
548 _permute_weights_to_nchw.run();
549 _original_weights->mark_as_unused();
552 if(_needs_weights_reshape)
556 _permuted_weights.allocator()->allocate();
558 _original_weights->mark_as_unused();
577 unsigned int depth_multiplier,
581 _depth_conv_func = get_depthwiseconvolution_function(input->
info(), weights->
info(), (biases !=
nullptr) ? biases->
info() :
nullptr, output->
info(),
conv_info, depth_multiplier, act_info,
582 dilation, gpu_target);
583 switch(_depth_conv_func)
586 _func_3x3.set_memory_group(_memory_manager);
587 _func_3x3.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
591 _func_generic.set_memory_group(_memory_manager);
592 _func_generic.configure(compile_context, input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
604 DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation, gpu_target);
605 switch(depth_conv_func)
633 switch(_depth_conv_func)
648 switch(_depth_conv_func)
654 _func_generic.prepare();
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
DepthwiseConvolutionFunction
Available DepthwiseConvolutionFunction.
bool dot8_supported(const cl::Device &device)
Helper function to check whether the cl_arm_integer_dot_product_int8 extension is supported...
void prepare() override
Prepare the function for executing.
void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, unsigned int idx_ofms, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr)
Compute quantized per-channel multipliers and shifts.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(...)
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info, unsigned int depth_multiplier, const Size2D &dilation=Size2D(1U, 1U))
Calculate the depthwise convolution output shape of a tensor.
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, ActivationLayerInfo act_info=ActivationLayerInfo(), GPUTarget gpu_target=GPUTarget::MIDGARD, const Size2D &dilation=Size2D(1U, 1U), const ITensorInfo *output_multipliers=nullptr, const ITensorInfo *output_shifts=nullptr)
Static function to check if given info will lead to a valid configuration of CLDepthwiseConvolutionLayer3x3NCHWKernel.
static CLScheduler & get()
Access the scheduler singleton.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
GPUTarget target() const
Get the target GPU.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
Strides PermutationVector
Permutation vector.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
GPUTarget get_arch_from_target(GPUTarget target)
Helper function to get the GPU arch.
TensorShape compute_reshaped_depthwise_weights_shape(const ITensorInfo &input, const DepthwiseConvolutionReshapeInfo &info)
Calculate the reshaped shape of the weights to use in depthwise convolution.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Activation Layer Information class.
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
void permute(Dimensions< T > &dimensions, const PermutationVector &perm)
Permutes given Dimensions according to a permutation vector.
1 channel, 1 S32 per channel
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, ActivationLayerInfo act_info=ActivationLayerInfo(), const Size2D &dilation=Size2D(1U, 1U), const ITensorInfo *output_multipliers=nullptr, const ITensorInfo *output_shifts=nullptr)
Static function to check if given info will lead to a valid configuration of CLDepthwiseConvolutionLayer3x3NHWCKernel.
Optimized Depthwise Convolution.
Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
virtual void prepare()
Prepare the function for executing.
quantized, asymmetric fixed-point 8-bit number unsigned
void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, ActivationLayerInfo act_info=ActivationLayerInfo(), const Size2D &dilation=Size2D(1U, 1U))
Initialize the function's source, destination, weights and convolution information.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
Padding and stride information class.
CLDepthwiseConvolutionLayer(std::shared_ptr< IMemoryManager > memory_manager=nullptr)
Default constructor.
void enqueue(ICLKernel &kernel, bool flush=true)
Schedule the execution of the passed kernel if possible.
Num samples, channels, height, width.
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
quantized, symmetric per channel fixed-point 8-bit number
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
~CLDepthwiseConvolutionLayer()
Default destructor.
Interface for OpenCL tensor.
GPUTarget
Available GPU Targets.
Class for specifying the size of an image or rectangle.
void run() override
Run the kernels contained in the function.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape)
Set the input and output tensors.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
Static function to check if given info will lead to a valid configuration of CLPermute.
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, ActivationLayerInfo act_info=ActivationLayerInfo(), const Size2D &dilation=Size2D(1U, 1U))
Static function to check if given info will lead to a valid configuration of CLDepthwiseConvolutionLayer.
bool is_data_type_float(DataType dt)
Check if a given data type is of floating point type.
Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes, const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info)
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info, const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U), const ITensorInfo *output_multipliers=nullptr, const ITensorInfo *output_shifts=nullptr)
Static function to check if given info will lead to a valid configuration of CLDepthwiseConvolutionLayerNativeKernel.