51 Status validate_arguments(
const ITensorInfo *
src,
const ITensorInfo *weights,
const ITensorInfo *biases,
const ITensorInfo *
dst,
52 const PadStrideInfo &
conv_info,
const ActivationLayerInfo &act_info)
63 ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx),
"Weights feature map dimension should match the respective src's one");
69 ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3,
"Strides larger than 3 not supported for 1x1 convolution.");
70 ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) && std::get<0>(conv_info.stride()) > 2,
71 "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
76 ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
77 "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
82 "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
97 "Biases size and number of dst feature maps should match");
99 "Biases should be one dimensional");
103 if(dst->total_size() != 0)
113 const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
114 const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
115 const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
117 float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
118 int output_multiplier = 0;
119 int output_shift = 0;
132 && (kernel_size <= 5)
133 && (conv_stride_x == 1) && (conv_stride_y == 1)
138 inline void setup_num_elems_nchw(
unsigned int &num_elems_read_per_iteration_x,
unsigned int &num_elems_read_per_iteration_y,
139 unsigned int &num_elems_written_per_iteration_x,
unsigned int &num_elems_written_per_iteration_y,
140 unsigned int kernel_size,
const PadStrideInfo &conv_info,
const GPUTarget target, ITensorInfo *src)
142 const DataType data_type = src->data_type();
143 const DataLayout data_layout = src->data_layout();
144 unsigned int conv_stride_x = std::get<0>(conv_info.stride());
145 unsigned int conv_stride_y = std::get<1>(conv_info.stride());
147 const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
149 if(run_optimized_bifrost)
156 num_elems_read_per_iteration_x = 4;
157 num_elems_read_per_iteration_y = 4;
158 num_elems_written_per_iteration_x = 4;
159 num_elems_written_per_iteration_y = 4;
164 num_elems_read_per_iteration_x = 6;
165 num_elems_read_per_iteration_y = 5;
166 num_elems_written_per_iteration_x = 4;
167 num_elems_written_per_iteration_y = 3;
172 num_elems_read_per_iteration_x = 8;
173 num_elems_read_per_iteration_y = 6;
174 num_elems_written_per_iteration_x = 4;
175 num_elems_written_per_iteration_y = 2;
186 num_elems_read_per_iteration_y = kernel_size;
187 num_elems_written_per_iteration_x = 8;
188 num_elems_written_per_iteration_y = 1;
192 switch(conv_stride_x)
195 num_elems_read_per_iteration_x = 8;
198 num_elems_read_per_iteration_x = 16;
201 switch(src->element_size())
204 num_elems_read_per_iteration_x = 28;
207 num_elems_read_per_iteration_x = 24;
210 num_elems_read_per_iteration_x = 22;
221 switch(conv_stride_x)
224 num_elems_read_per_iteration_x = 10;
227 num_elems_read_per_iteration_x = 17;
234 switch(conv_stride_x)
237 num_elems_read_per_iteration_x = 12;
240 num_elems_read_per_iteration_x = 20;
247 switch(conv_stride_x)
250 num_elems_read_per_iteration_x = 16;
253 num_elems_read_per_iteration_x = 24;
265 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst,
const PadStrideInfo &conv_info,
const GPUTarget target)
267 const DataLayout data_layout = src->data_layout();
276 src->quantization_info());
280 const unsigned int vec_size = std::min(static_cast<unsigned int>(dst->tensor_shape()[0]), 4u);
281 unsigned int num_rows = 1
U;
282 if(dst->tensor_shape()[0] > 16)
289 return std::make_pair(Status{}, win);
294 const unsigned int kernel_size = weights->dimension(width_idx);
296 unsigned int num_elems_read_per_iteration_x = 0;
297 unsigned int num_elems_read_per_iteration_y = 0;
298 unsigned int num_elems_written_per_iteration_x = 0;
299 unsigned int num_elems_written_per_iteration_y = 0;
303 unsigned int conv_stride_x = std::get<0>(conv_info.stride());
304 unsigned int conv_stride_y = std::get<1>(conv_info.stride());
306 setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
307 num_elems_written_per_iteration_x, num_elems_written_per_iteration_y,
308 kernel_size, conv_info, target, src);
311 bool window_changed =
false;
312 Window win =
calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
314 AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y);
315 AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size);
316 AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
318 output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
320 return std::make_pair(err, win);
328 bool export_to_cl_image_support(ITensorInfo *tensor,
GPUTarget gpu_target,
DataLayout data_layout)
358 const size_t image_w = tensor->tensor_shape()[0] / 4;
359 const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
363 if(image_w > max_image_w || image_h > max_image_h)
391 const int conv_stride_x = std::get<0>(conv_info.
stride());
392 const int conv_stride_y = std::get<1>(conv_info.
stride());
400 const unsigned int kernel_size = weights->
dimension(width_idx);
406 auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target);
408 ICLKernel::configure_internal(win_config.second);
417 kernel_name <<
"direct_convolution_nhwc";
419 const unsigned int n0 = win_config.second.x().step();
420 const unsigned int m0 = win_config.second.y().step();
422 const unsigned int partial_store_n0 = dst->
dimension(channel_idx) % n0;
423 const unsigned int pad_left = conv_info.
pad_left();
424 const unsigned int pad_top = conv_info.
pad_top();
425 const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout);
428 if(export_to_cl_image)
433 if(biases !=
nullptr)
435 build_options.
add_option(std::string(
"-DHAS_BIAS"));
439 build_options.
add_option(
"-cl-fast-relaxed-math");
440 build_options.
add_option(
"-DSRC_TENSOR_TYPE=BUFFER");
445 build_options.
add_option(
"-DDST_TENSOR_TYPE=BUFFER");
450 build_options.
add_option_if_else(export_to_cl_image,
"-DWEI_TENSOR_TYPE=IMAGE",
"-DWEI_TENSOR_TYPE=BUFFER");
472 zero_value.
get(zero_value_s32);
475 int output_multiplier = 0;
476 int output_shift = 0;
502 kernel_name <<
"direct_convolution" << kernel_size <<
"x" << kernel_size;
504 build_options.
add_option_if(biases !=
nullptr, std::string(
"-DHAS_BIAS"));
506 const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout);
508 if(run_optimized_for_bifrost)
512 kernel_name <<
"_f32_bifrost";
529 int output_multiplier = 0;
530 int output_shift = 0;
539 kernel_name.str(
"direct_convolution_quantized");
547 _config_id = kernel_name.str();
596 cl::Image2D weights_cl_image;
598 const size_t dim_y_collapsed =
ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.
y().
step());
604 if(export_to_cl_image)
606 const size_t image_w = weights->info()->dimension(0) / 4;
607 const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
609 const size_t image_row_pitch = weights->info()->strides_in_bytes()[1];
615 unsigned int idx = 0;
618 if(export_to_cl_image)
620 _kernel.setArg(idx++, weights_cl_image);
623 if(biases !=
nullptr)
649 if(biases !=
nullptr)
656 _kernel.setArg(idx1++, static_cast<unsigned int>(weights->info()->strides_in_bytes()[3]));
660 unsigned int idx = 0;
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
Class describing the value of a pixel for any image format.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor)
bool image2d_from_buffer_supported(const cl::Device &device)
Helper function to check whether the cl_khr_image2d_from_buffer extension is supported.
const Window & window() const
The maximum window the kernel can be executed on.
const size_t conv_pad_left
bool enabled() const
Check if initialised.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
Container for 2D border size.
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.
const StringSet & options() const
Gets the current options list set.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
constexpr int step() const
Return the step of the dimension.
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
float a() const
Get the alpha value.
void get(uint8_t &v) const
Interpret the pixel value as a U8.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
std::string to_string(T &&value)
Convert integer and float values to string.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
const std::string & string_from_activation_func(ActivationLayerInfo::ActivationFunction act)
Translates a given activation function to a string.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Store the tensor's metadata.
static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target)
Static function to check if given info will lead to a valid configuration.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Describe one of the image's dimensions with a start, end and step.
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context...
unsigned int pad_top() const
Get the top padding.
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
GPUTarget get_arch_from_target(GPUTarget target)
Helper function to get the GPU arch.
std::string lower_string(const std::string &val)
Lower a given string.
Activation Layer Information class.
std::set< std::string > build_options
void update_padding_for_cl_image(ITensorInfo *tensor)
Update padding required to export the OpenCL buffer to OpenCL image2d.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx...
void use_tensor_dimensions(const TensorShape &shape, size_t first_dimension=Window::DimX)
Use the tensor's dimensions to fill the window dimensions.
SimpleTensor< float > src
Copyright (c) 2017-2021 Arm Limited.
void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
Set the src, weights, biases and dst tensors info.
1 channel, 1 F16 per channel
1 channel, 1 S32 per channel
void add_option(std::string option)
Adds option to the existing build option list.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set< std::string > &build_opts=std::set< std::string >())
Creates an opencl kernel using a compile context.
const std::string & string_from_data_type(DataType dt)
Convert a data type identity into a string.
std::string get_data_size_from_data_type(const DataType &dt)
Get the size of a data type in number of bits.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
static constexpr unsigned int num_arguments_per_3D_tensor()
Returns the number of arguments enqueued per 3D tensor object.
std::string float_to_string_with_full_precision(float val)
Create a string with the float in full precision.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number greater than or equal to value that is a multiple of divisor.
quantized, asymmetric fixed-point 8-bit number unsigned
std::pair< unsigned int, unsigned int > stride() const
Get the stride.
GPUTarget get_target() const
Get the targeted GPU architecture.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
void add_option_if(bool cond, std::string option)
Adds option if a given condition is true.
Padding and stride information class.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
virtual PaddingSize padding() const =0
Padding of tensor.
bool slide_window_slice_3D(Window &slice) const
Slide the passed 3D window slice.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
BorderSize border_size() const override
The size of the border for that kernel.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
const size_t conv_stride_x
Num samples, channels, height, width.
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
size_t get_cl_image_pitch_alignment(const cl::Device &device)
Helper function to get the cl_image pitch alignment in pixels.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
void set_dimension_step(size_t dimension, int step)
Set the step of a given dimension.
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
const std::string & string_from_data_layout(DataLayout dl)
Convert a data layout identity into a string.
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
const size_t conv_pad_top
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
GPUTarget
Available GPU Targets.
size_t get_data_layout_dimension_index(const DataLayout &data_layout, const DataLayoutDimension &data_layout_dimension)
Get the index of the given dimension.
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue...
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Num samples, height, width, channels.
constexpr const Dimension & y() const
Alias to access the second dimension of the window.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch)
Create a cl::Image2D object from an OpenCL buffer.
unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
Returns the adjusted vector size in case it is less than the input's first dimension, getting rounded down to its closest valid vector size.
ActivationFunction activation() const
Get the type of activation function.
float b() const
Get the beta value.
quantized, asymmetric fixed-point 8-bit number signed
void adjust(size_t dimension, int adjust_value, bool is_at_start)
Adjust the start or end of a given dimension by the given value.
const size_t conv_stride_y
void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx...
Window first_slice_window_3D() const
First 3D slice of the window.
DataType
Available data types.
void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx...
unsigned int pad_left() const
Get the left padding.
DataLayout
[DataLayout enum definition]
Describe a multidimensional execution window.
TensorShape compute_deep_convolution_shape(const TensorShape &input_shape, DataLayout input_data_layout, const TensorShape &weights_shape, const PadStrideInfo &conv_info)
Calculate the output shape of a deep convolution.
bool is_data_type_float(DataType dt)
Check if a given data type is of floating point type.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
bool gpu_target_is_in(GPUTarget target_to_check, GPUTarget target, Args... targets)
Helper function to check whether a gpu target is equal to the provided targets.
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.
const cl::Device & get_device()
Gets the CL device for which the programs are created.
void add_option_if_else(bool cond, std::string option_true, std::string option_false)
Adds first option if condition is true else the second one.