bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned int depth_multiplier, GPUTarget gpu_target)
{
    const size_t idx_w    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
    const size_t idx_h    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
    const size_t kernel_w = weights->tensor_shape()[idx_w];
    const size_t kernel_h = weights->tensor_shape()[idx_h];
    if((kernel_w == 1) && (kernel_h == 1))
    {
        return false;
    }
    if(depth_multiplier > 1)
    {
        return false;
    }
    // ...
}
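// Added commentary (not in the original source): exporting the weights to an
// OpenCL image lets the kernel read them through the GPU's texture cache. The
// early returns above disqualify the cases where that cannot help: a 1x1 kernel
// reads too little data per access to benefit, and depth_multiplier > 1 uses a
// different weights access pattern than the cl_image path is written for.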
void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info, const ITensorInfo *weights,
                                        const PadStrideInfo &conv_info, const Size2D &dilation,
                                        unsigned int depth_multiplier, GPUTarget gpu_target)
{
    // Non floating-point types: never export to cl_image and use fixed block sizes
    if(!is_data_type_float(weights->data_type()))
    {
        dwc_compute_info.export_weights_to_cl_image = false;
        dwc_compute_info.n0                         = (depth_multiplier == 1) ? 4 : 1;
        if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
        {
            dwc_compute_info.m0 = 2;
        }
        else
        {
            dwc_compute_info.m0 = 1;
        }
        return;
    }
    // Floating-point types: first check whether the weights can be exported to cl_image
    dwc_compute_info.export_weights_to_cl_image = export_weights_to_cl_image_heuristic(weights, depth_multiplier, gpu_target);
    // Set n0
    if(depth_multiplier == 1)
    {
        if(dwc_compute_info.export_weights_to_cl_image == false && weights->data_type() == DataType::F16)
        {
            dwc_compute_info.n0 = 8;
        }
        else
        {
            dwc_compute_info.n0 = 4;
        }
    }
    else
    {
        dwc_compute_info.n0 = 1;
    }
    dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0));
    // Set m0 only when stride_x == 1 and dilation_x == 1
    if(conv_info.stride().first == 1 && dilation.x() == 1)
    {
        const size_t idx_w    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
        const size_t kernel_w = weights->tensor_shape()[idx_w];
        dwc_compute_info.m0   = ((kernel_w >= 9) || (kernel_w == 1)) ? 1 : 2;
    }
    else
    {
        dwc_compute_info.m0 = 1;
    }
}
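// Worked example (a sketch with assumed values, not taken from the library):
// F16 weights laid out NHWC with C = 16 and a 3x3 kernel, stride_x == 1,
// dilation_x == 1, depth_multiplier == 1.
DWCComputeKernelInfo example_info;
TensorInfo example_weights(TensorShape(16U, 3U, 3U), 1, DataType::F16); // hypothetical metadata: C=16, W=3, H=3
example_weights.set_data_layout(DataLayout::NHWC);
initialize_dwc_native_compute_info(example_info, &example_weights, PadStrideInfo(1, 1, 1, 1), Size2D(1U, 1U), 1U, GPUTarget::G76);
// If the cl_image heuristic declines: n0 == 8 (F16 without cl_image; C >= 8, so
// adjust_vec_size() keeps it) and m0 == 2 (kernel_w == 3 is neither 1 nor >= 9).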
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)),
      _permute_input_to_nhwc(),
      _permute_weights_to_nhwc(),
      _permute_output_to_nchw(),
      // ...
      _output_multipliers(),
      // ...
      _needs_permute(false)
      // ...
{
}
    ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), weights->info(),
                                        biases != nullptr ? biases->info() : nullptr,
                                        output != nullptr ? output->info() : input->info(),
                                        conv_info, depth_multiplier, act_info, dilation));
    // ...
    _is_prepared      = false;
    _original_weights = weights;
    const ICLTensor *weights_to_use = weights;
    // ...
    if(_needs_permute)
    {
        _memory_group.manage(&_permuted_input);
        _memory_group.manage(&_permuted_output);
        // ...
        input_to_use   = &_permuted_input;
        weights_to_use = &_permuted_weights;
        output_to_use  = &_permuted_output;
    }
    CLTensor *output_multipliers_to_use = nullptr;
    CLTensor *output_shifts_to_use      = nullptr;
    if(_is_quantized)
    {
        // ...
        output_multipliers_to_use = &_output_multipliers;
        output_shifts_to_use      = &_output_shifts;
    }
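// Added commentary: for quantized data types the kernel consumes per-channel
// requantisation parameters, one S32 multiplier and one S32 shift per output
// channel (a single pair when the weights are not quantized per channel);
// _output_multipliers and _output_shifts are the CLTensor objects holding them.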
    DWCComputeKernelInfo dwc_native_compute_info;
    initialize_dwc_native_compute_info(dwc_native_compute_info, weights_to_use->info(), conv_info, dilation, depth_multiplier, gpu_target);
    // ...
    _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
                                  dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, output_shifts_to_use);
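// Usage sketch (assumed tensor setup; shape initialisation and allocation are
// expected to happen elsewhere): configure the layer once, then execute it.
CLTensor src, wei, bia, dst;
CLDepthwiseConvolutionLayer dwc_layer;
dwc_layer.configure(&src, &wei, &bia, &dst, PadStrideInfo(1, 1, 1, 1)); // stride 1, 1-pixel padding
dwc_layer.run();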
    const bool in_place = (input == output) || (output == nullptr);
    const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC);
    const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC);
    const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC);
    initialize_dwc_native_compute_info(dwc_native_compute_info, &permuted_weights, conv_info, dilation, depth_multiplier, gpu_target);
    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output,
                                                                                  dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
    // ...
    initialize_dwc_native_compute_info(dwc_native_compute_info, weights, conv_info, dilation, depth_multiplier, gpu_target);
    ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output,
                                                                                  dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
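// A sketch of checking a configuration up front through the static entry point
// (tensor infos assumed valid; depth_multiplier, activation and dilation use
// their defaults):
const Status status = CLDepthwiseConvolutionLayer::validate(src.info(), wei.info(), bia.info(), dst.info(),
                                                            PadStrideInfo(1, 1, 1, 1));
ARM_COMPUTE_ERROR_THROW_ON(status);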
    _permute_input_to_nhwc.run();
    // ...
    _permute_output_to_nchw.run();
        _output_multipliers.map();
        _output_shifts.map();
        compute_quantized_multipliers_and_shifts(_input->info(),
                                                 _original_weights->info(),
                                                 _output != nullptr ? _output->info() : _input->info(),
                                                 reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
                                                 reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
        _output_multipliers.unmap();
        _output_shifts.unmap();
        _permute_weights_to_nhwc.run();
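// Added commentary: the first run() invokes prepare(), which maps the
// multiplier/shift buffers to compute the quantized parameters and, when a
// layout permute is needed, converts the weights to NHWC once and marks the
// originals unused. Later calls go straight to the kernel enqueue:
for(int i = 0; i < 10; ++i)
{
    dwc_layer.run(); // only the first iteration pays the preparation cost
}
CLScheduler::get().sync(); // block until the queued OpenCL work has finished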