bool export_weights_to_cl_image_heuristic(const ITensorInfo *weights, unsigned int depth_multiplier, GPUTarget gpu_target)
{
    if(!export_weights_to_cl_image(weights))
    {
        return false;
    }

    const size_t idx_w    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
    const size_t idx_h    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
    const size_t kernel_w = weights->tensor_shape()[idx_w];
    const size_t kernel_h = weights->tensor_shape()[idx_h];

    // Pointwise (1x1) kernels do not benefit from the cl_image import path
    if((kernel_w == 1) && (kernel_h == 1))
    {
        return false;
    }

    if(depth_multiplier > 1)
    {
        return false;
    }

    if(get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
    {
        return false;
    }

    return true;
}
void initialize_dwc_native_compute_info(DWCComputeKernelInfo &dwc_compute_info, const ITensorInfo *weights, const PadStrideInfo &conv_info, const Size2D &dilation,
                                        unsigned int depth_multiplier, GPUTarget gpu_target)
{
    // Quantized path: no cl_image export, conservative block sizes
    if(!is_data_type_float(weights->data_type()))
    {
        dwc_compute_info.export_weights_to_cl_image = false;
        dwc_compute_info.n0                         = (depth_multiplier == 1) ? 4 : 1;
        if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
        {
            dwc_compute_info.m0 = 2;
        }
        else
        {
            dwc_compute_info.m0 = 1;
        }
        return;
    }

    // Floating-point path: first decide whether the weights go through a cl_image
    dwc_compute_info.export_weights_to_cl_image = export_weights_to_cl_image_heuristic(weights, depth_multiplier, gpu_target);

    // Set the number of output channels processed per work-item (n0)
    if(depth_multiplier == 1)
    {
        if(dwc_compute_info.export_weights_to_cl_image == false && weights->data_type() == DataType::F16)
        {
            dwc_compute_info.n0 = 8;
        }
        else
        {
            dwc_compute_info.n0 = 4;
        }
    }
    else
    {
        dwc_compute_info.n0 = 1;
    }

    dwc_compute_info.n0 = adjust_vec_size(dwc_compute_info.n0, weights->dimension(0));

    // Set the number of rows processed per work-item (m0), only worthwhile when stride_x == 1 and dilation_x == 1
    if(conv_info.stride().first == 1 && dilation.x() == 1)
    {
        const size_t idx_w    = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
        const size_t kernel_w = weights->tensor_shape()[idx_w];

        dwc_compute_info.m0 = (kernel_w >= 9) || (kernel_w == 1) ? 1 : 2;
    }
    else
    {
        dwc_compute_info.m0 = 1;
    }
}
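
// --- Illustrative sketch, not part of the original file ---
// How the helper above might be exercised; the TensorInfo setup and shapes are
// assumptions for the example, and the call assumes visibility of the file-local
// helper within this translation unit.
//
//     DWCComputeKernelInfo example_info;
//     TensorInfo example_weights(TensorShape(24U, 3U, 3U), 1, DataType::F16); // NHWC: 24 channels, 3x3 kernel
//     example_weights.set_data_layout(DataLayout::NHWC);
//     initialize_dwc_native_compute_info(example_info, &example_weights,
//                                        PadStrideInfo(1, 1, 1, 1), Size2D(1, 1),
//                                        1U, CLScheduler::get().target());
//
// With stride_x == 1, dilation_x == 1 and kernel_w == 3 this selects m0 = 2, and
// n0 = 4 (or adjust_vec_size(8, 24) == 8 when the cl_image export is rejected for F16).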
CLDepthwiseConvolutionLayer::CLDepthwiseConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
    : _memory_group(std::move(memory_manager)),
      _dwc_native_kernel(std::make_unique<CLDepthwiseConvolutionLayerNativeKernel>()),
      _permute_input_to_nhwc(),
      _permute_weights_to_nhwc(),
      _permute_output_to_nchw(),
      _permuted_input(),
      _permuted_weights(),
      _permuted_output(),
      _output_multipliers(),
      _output_shifts(),
      _original_weights(),
      _input(),
      _output(),
      _needs_permute(false),
      _is_prepared(false),
      _is_quantized(false)
{
}
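
// --- Usage sketch, not part of the original file ---
// The memory manager handed to this constructor is typically assembled from the
// stock ACL runtime pieces so intermediate tensors (e.g. _permuted_input and
// _permuted_output above) can share pooled memory; this wiring is an assumption
// about a typical call site, not code from this file.
//
//     #include "arm_compute/runtime/BlobLifetimeManager.h"
//     #include "arm_compute/runtime/MemoryManagerOnDemand.h"
//     #include "arm_compute/runtime/PoolManager.h"
//
//     auto lifetime_mgr = std::make_shared<arm_compute::BlobLifetimeManager>();
//     auto pool_mgr     = std::make_shared<arm_compute::PoolManager>();
//     auto memory_mgr   = std::make_shared<arm_compute::MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);
//     arm_compute::CLDepthwiseConvolutionLayer dwc(memory_mgr);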
void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
                                            const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
                                                                     weights->info(),
                                                                     biases != nullptr ? biases->info() : nullptr,
                                                                     output != nullptr ? output->info() : input->info(),
                                                                     conv_info, depth_multiplier, act_info, dilation));

    _is_quantized     = is_data_type_quantized(input->info()->data_type());
    _is_prepared      = false;
    _original_weights = weights;
    _input            = input;
    _output           = output;
    _needs_permute    = input->info()->data_layout() == DataLayout::NCHW;

    const GPUTarget gpu_target = CLScheduler::get().target();

    ICLTensor       *input_to_use   = input;
    const ICLTensor *weights_to_use = weights;
    ICLTensor       *output_to_use  = output;
    if(_needs_permute)
    {
        _memory_group.manage(&_permuted_input);
        _memory_group.manage(&_permuted_output);

        // Transform the input and the weights from NCHW to NHWC
        _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U));
        _permuted_input.info()->set_data_layout(DataLayout::NHWC);
        _permute_weights_to_nhwc.configure(compile_context, weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
        _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
        // ...

        input_to_use   = &_permuted_input;
        weights_to_use = &_permuted_weights;
        output_to_use  = &_permuted_output;
    }

    CLTensor *output_multipliers_to_use = nullptr;
    CLTensor *output_shifts_to_use      = nullptr;
    if(_is_quantized)
    {
        const size_t idx_c       = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
        const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;

        _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
        _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));

        output_multipliers_to_use = &_output_multipliers;
        output_shifts_to_use      = &_output_shifts;
    }

    DWCComputeKernelInfo dwc_native_compute_info;
    initialize_dwc_native_compute_info(dwc_native_compute_info, weights_to_use->info(), conv_info, dilation, depth_multiplier, gpu_target);

    const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };

    _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
                                  dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, output_shifts_to_use);
    // ...
}
Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
                                             const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
{
    const bool in_place = input == output || output == nullptr;
    // ... (argument checks and the setup of needs_permute, gpu_target, conv_kernel_info
    //      and output_multipliers_shifts_info elided)

    DWCComputeKernelInfo dwc_native_compute_info;
    if(needs_permute)
    {
        // ... (permuted shapes computed via permute() elided)
        const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC);
        const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC);
        const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC);

        initialize_dwc_native_compute_info(dwc_native_compute_info, &permuted_weights, conv_info, dilation, depth_multiplier, gpu_target);
        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output,
                                                                                      dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
    }
    else
    {
        initialize_dwc_native_compute_info(dwc_native_compute_info, weights, conv_info, dilation, depth_multiplier, gpu_target);
        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
                                                                                      &output_multipliers_shifts_info));
    }
    return Status{};
}
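
// --- Usage sketch, not part of the original file ---
// Because validate() mirrors configure() without touching device memory, callers
// can vet a configuration up front. The tensors below are assumed to be CLTensors
// whose infos have already been initialized; the shapes are illustrative.
//
//     const Status status = CLDepthwiseConvolutionLayer::validate(src.info(), weights.info(), biases.info(), dst.info(),
//                                                                 PadStrideInfo(1, 1, 1, 1), /*depth_multiplier=*/1);
//     if(status.error_code() != ErrorCode::OK)
//     {
//         std::cerr << status.error_description() << std::endl;
//     }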
void CLDepthwiseConvolutionLayer::run()
{
    prepare();

    MemoryGroupResourceScope scope_mg(_memory_group);

    if(_needs_permute)
    {
        _permute_input_to_nhwc.run();
    }
    CLScheduler::get().enqueue(*_dwc_native_kernel);
    if(_needs_permute)
    {
        _permute_output_to_nchw.run();
    }
}
void CLDepthwiseConvolutionLayer::prepare()
{
    if(!_is_prepared)
    {
        if(_is_quantized)
        {
            _output_multipliers.map();
            _output_shifts.map();
            quantization::compute_quantized_multipliers_and_shifts(_input->info(),
                                                                   _original_weights->info(),
                                                                   _output != nullptr ? _output->info() : _input->info(),
                                                                   reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
                                                                   reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
            _output_multipliers.unmap();
            _output_shifts.unmap();
        }

        if(_needs_permute)
        {
            ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());

            _permuted_weights.allocator()->allocate();
            _permute_weights_to_nhwc.run();
            _original_weights->mark_as_unused();
        }
        _is_prepared = true;
    }
}
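
// --- End-to-end sketch, not part of the original file ---
// A minimal, hedged example of driving this function through the public ACL
// runtime API; the shapes, data type and padding are assumptions for the example.
//
//     #include "arm_compute/runtime/CL/CLFunctions.h"
//     #include "arm_compute/runtime/CL/CLScheduler.h"
//     #include "arm_compute/runtime/CL/CLTensor.h"
//
//     using namespace arm_compute;
//
//     int main()
//     {
//         CLScheduler::get().default_init(); // create context/queue, pick the GPU target
//
//         // NHWC tensors: 1 batch, 56x56 spatial, 24 channels; one 3x3 F16 filter per channel
//         CLTensor src, weights, biases, dst;
//         TensorInfo src_info(TensorShape(24U, 56U, 56U, 1U), 1, DataType::F16);
//         TensorInfo wei_info(TensorShape(24U, 3U, 3U), 1, DataType::F16);
//         TensorInfo bia_info(TensorShape(24U), 1, DataType::F16);
//         TensorInfo dst_info(TensorShape(24U, 56U, 56U, 1U), 1, DataType::F16); // same padding keeps the spatial size
//         for(auto *info : { &src_info, &wei_info, &bia_info, &dst_info })
//         {
//             info->set_data_layout(DataLayout::NHWC);
//         }
//         src.allocator()->init(src_info);
//         weights.allocator()->init(wei_info);
//         biases.allocator()->init(bia_info);
//         dst.allocator()->init(dst_info);
//
//         CLDepthwiseConvolutionLayer dwc;
//         dwc.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1), /*depth_multiplier=*/1);
//
//         src.allocator()->allocate();
//         weights.allocator()->allocate();
//         biases.allocator()->allocate();
//         dst.allocator()->allocate();
//         // ... fill src/weights/biases (e.g. via map()/unmap()) ...
//
//         dwc.run();                 // the first run() also triggers prepare()
//         CLScheduler::get().sync(); // wait for the OpenCL queue to drain
//         return 0;
//     }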