51 Status validate_arguments(
const ITensorInfo *
src,
const ITensorInfo *weights,
const ITensorInfo *biases,
const ITensorInfo *
dst,
52 const PadStrideInfo &
conv_info,
const ActivationLayerInfo &act_info)
65 "Weights feature map dimension should match the respective src's one");
67 ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3,
"Strides larger than 3 not supported for 1x1 convolution.");
69 && std::get<0>(conv_info.stride()) > 2,
70 "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
72 "Activation supported only for floating point and NHWC.");
78 ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
79 "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
84 "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
99 "Biases size and number of src feature maps should match");
101 "Biases should be one dimensional");
105 if(dst->total_size() != 0)
115 const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
116 const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
117 const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
119 float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
120 int output_multiplier = 0;
121 int output_shift = 0;
134 && (kernel_size <= 5)
135 && (conv_stride_x == 1) && (conv_stride_y == 1)
140 inline void setup_num_elems_nchw(
unsigned int &num_elems_read_per_iteration_x,
unsigned int &num_elems_read_per_iteration_y,
141 unsigned int &num_elems_written_per_iteration_x,
unsigned int &num_elems_written_per_iteration_y,
142 unsigned int kernel_size,
const PadStrideInfo &conv_info,
const GPUTarget target, ITensorInfo *src)
144 const DataType data_type = src->data_type();
145 const DataLayout data_layout = src->data_layout();
146 unsigned int conv_stride_x = std::get<0>(conv_info.stride());
147 unsigned int conv_stride_y = std::get<1>(conv_info.stride());
149 const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);
151 if(run_optimized_bifrost)
158 num_elems_read_per_iteration_x = 4;
159 num_elems_read_per_iteration_y = 4;
160 num_elems_written_per_iteration_x = 4;
161 num_elems_written_per_iteration_y = 4;
166 num_elems_read_per_iteration_x = 6;
167 num_elems_read_per_iteration_y = 5;
168 num_elems_written_per_iteration_x = 4;
169 num_elems_written_per_iteration_y = 3;
174 num_elems_read_per_iteration_x = 8;
175 num_elems_read_per_iteration_y = 6;
176 num_elems_written_per_iteration_x = 4;
177 num_elems_written_per_iteration_y = 2;
188 num_elems_read_per_iteration_y = kernel_size;
189 num_elems_written_per_iteration_x = 8;
190 num_elems_written_per_iteration_y = 1;
194 switch(conv_stride_x)
197 num_elems_read_per_iteration_x = 8;
200 num_elems_read_per_iteration_x = 16;
203 switch(src->element_size())
206 num_elems_read_per_iteration_x = 28;
209 num_elems_read_per_iteration_x = 24;
212 num_elems_read_per_iteration_x = 22;
223 switch(conv_stride_x)
226 num_elems_read_per_iteration_x = 10;
229 num_elems_read_per_iteration_x = 17;
236 switch(conv_stride_x)
239 num_elems_read_per_iteration_x = 12;
242 num_elems_read_per_iteration_x = 20;
249 switch(conv_stride_x)
252 num_elems_read_per_iteration_x = 16;
255 num_elems_read_per_iteration_x = 24;
267 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst,
const PadStrideInfo &conv_info,
const GPUTarget target)
269 const DataLayout data_layout = src->data_layout();
278 src->quantization_info());
282 const unsigned int vec_size = std::min(static_cast<unsigned int>(dst->tensor_shape()[0]), 4u);
283 unsigned int num_rows = 1
U;
284 if(dst->tensor_shape()[0] > 16)
291 return std::make_pair(Status{}, win);
296 const unsigned int kernel_size = weights->dimension(width_idx);
298 unsigned int num_elems_read_per_iteration_x = 0;
299 unsigned int num_elems_read_per_iteration_y = 0;
300 unsigned int num_elems_written_per_iteration_x = 0;
301 unsigned int num_elems_written_per_iteration_y = 0;
305 unsigned int conv_stride_x = std::get<0>(conv_info.stride());
306 unsigned int conv_stride_y = std::get<1>(conv_info.stride());
308 setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
309 num_elems_written_per_iteration_x, num_elems_written_per_iteration_y,
310 kernel_size, conv_info, target, src);
313 bool window_changed =
false;
314 Window win =
calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));
316 AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y);
317 AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size);
318 AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
320 output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
322 return std::make_pair(err, win);
330 bool export_to_cl_image_support(ITensorInfo *tensor,
GPUTarget gpu_target,
DataLayout data_layout)
360 const size_t image_w = tensor->tensor_shape()[0] / 4;
361 const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3];
365 if(image_w > max_image_w || image_h > max_image_h)
393 const int conv_stride_x = std::get<0>(conv_info.
stride());
394 const int conv_stride_y = std::get<1>(conv_info.
stride());
402 const unsigned int kernel_size = weights->
dimension(width_idx);
408 auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target);
410 ICLKernel::configure_internal(win_config.second);
419 kernel_name <<
"direct_convolution_nhwc";
421 const unsigned int n0 = win_config.second.x().step();
422 const unsigned int m0 = win_config.second.y().step();
424 const unsigned int partial_store_n0 = dst->
dimension(channel_idx) % n0;
425 const unsigned int pad_left = conv_info.
pad_left();
426 const unsigned int pad_top = conv_info.
pad_top();
427 const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout);
430 if(export_to_cl_image)
435 if(biases !=
nullptr)
437 build_options.
add_option(std::string(
"-DHAS_BIAS"));
441 build_options.
add_option(
"-cl-fast-relaxed-math");
442 build_options.
add_option(
"-DSRC_TENSOR_TYPE=BUFFER");
447 build_options.
add_option(
"-DDST_TENSOR_TYPE=BUFFER");
452 build_options.
add_option_if_else(export_to_cl_image,
"-DWEI_TENSOR_TYPE=IMAGE",
"-DWEI_TENSOR_TYPE=BUFFER");
474 zero_value.
get(zero_value_s32);
477 int output_multiplier = 0;
478 int output_shift = 0;
504 kernel_name <<
"direct_convolution" << kernel_size <<
"x" << kernel_size;
506 build_options.
add_option_if(biases !=
nullptr, std::string(
"-DHAS_BIAS"));
508 const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout);
510 if(run_optimized_for_bifrost)
514 kernel_name <<
"_f32_bifrost";
531 int output_multiplier = 0;
532 int output_shift = 0;
541 kernel_name.str(
"direct_convolution_quantized");
549 _config_id = kernel_name.str();
598 cl::Image2D weights_cl_image;
600 const size_t dim_y_collapsed =
ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.
y().
step());
606 if(export_to_cl_image)
608 const size_t image_w = weights->info()->dimension(0) / 4;
609 const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
611 const size_t image_row_pitch = weights->info()->strides_in_bytes()[1];
617 unsigned int idx = 0;
620 if(export_to_cl_image)
622 _kernel.setArg(idx++, weights_cl_image);
625 if(biases !=
nullptr)
651 if(biases !=
nullptr)
658 _kernel.setArg(idx1++, static_cast<unsigned int>(weights->info()->strides_in_bytes()[3]));
662 unsigned int idx = 0;
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
Class describing the value of a pixel for any image format.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor)
bool image2d_from_buffer_supported(const cl::Device &device)
Helper function to check whether the cl_khr_image2d_from_buffer extension is supported.
const Window & window() const
The maximum window the kernel can be executed on.
const size_t conv_pad_left
bool enabled() const
Check if initialised.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
Container for 2D border size.
void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint=CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items=false)
Add the kernel to the command queue with the given window.
const StringSet & options() const
Gets the current options list set.
TensorShape compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info)
Calculate the deep convolution shape output shape of a tensor.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
constexpr int step() const
Return the step of the dimension.
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
float a() const
Get the alpha value.
void get(uint8_t &v) const
Interpret the pixel value as a U8.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
std::string to_string(T &&value)
Convert integer and float values to string.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
const DataLayout data_layout
const std::string & string_from_activation_func(ActivationLayerInfo::ActivationFunction act)
Translates a given activation function to a string.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Store the tensor's metadata.
static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target)
Static function to check if given info will lead to a valid configuration.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Describe one of the image's dimensions with a start, end and step.
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context...
unsigned int pad_top() const
Get the top padding.
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
GPUTarget get_arch_from_target(GPUTarget target)
Helper function to get the GPU arch.
std::string lower_string(const std::string &val)
Lower a given string.
Activation Layer Information class.
std::set< std::string > build_options
void update_padding_for_cl_image(ITensorInfo *tensor)
Update padding required to export the OpenCL buffer to OpenCL image2d.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx...
void use_tensor_dimensions(const TensorShape &shape, size_t first_dimension=Window::DimX)
Use the tensor's dimensions to fill the window dimensions.
SimpleTensor< float > src
Copyright (c) 2017-2021 Arm Limited.
void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
Set the src, weights, biases and dst tensors info.
1 channel, 1 F16 per channel
1 channel, 1 S32 per channel
void add_option(std::string option)
Adds option to the existing build option list.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set< std::string > &build_opts=std::set< std::string >())
Creates an opencl kernel using a compile context.
const std::string & string_from_data_type(DataType dt)
Convert a data type identity into a string.
std::string get_data_size_from_data_type(const DataType &dt)
Get the size of a data type in number of bits.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
static constexpr unsigned int num_arguments_per_3D_tensor()
Returns the number of arguments enqueued per 3D tensor object.
std::string float_to_string_with_full_precision(float val)
Create a string with the float in full precision.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
quantized, asymmetric fixed-point 8-bit number unsigned
std::pair< unsigned int, unsigned int > stride() const
Get the stride.
GPUTarget get_target() const
Get the targeted GPU architecture.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
void add_option_if(bool cond, std::string option)
Adds option if a given condition is true;.
Padding and stride information class.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
virtual PaddingSize padding() const =0
Padding of tensor.
bool slide_window_slice_3D(Window &slice) const
Slide the passed 3D window slice.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
BorderSize border_size() const override
The size of the border for that kernel.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
const size_t conv_stride_x
Num samples, channels, height, width.
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
size_t get_cl_image_pitch_alignment(const cl::Device &device)
Helper function to get the cl_image pitch alignment in pixels.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
void set_dimension_step(size_t dimension, int step)
Set the step of a given dimension.
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
const std::string & string_from_data_layout(DataLayout dl)
Convert a data layout identity into a string.
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
const size_t conv_pad_top
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
GPUTarget
Available GPU Targets.
void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue...
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Num samples, height, width, channels.
constexpr const Dimension & y() const
Alias to access the second dimension of the window.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch)
Create a cl::Image2D object from an OpenCL buffer.
unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
Returns the adjusted vector size in case it is less than the input's first dimension, getting rounded down to its closest valid vector size.
ActivationFunction activation() const
Get the type of activation function.
float b() const
Get the beta value.
quantized, asymmetric fixed-point 8-bit number signed
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
void adjust(size_t dimension, int adjust_value, bool is_at_start)
Adjust the start or end of a given dimension by the given value.
const size_t conv_stride_y
void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx...
Window first_slice_window_3D() const
First 3D slice of the window.
DataType
Available data types.
void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx...
unsigned int pad_left() const
Get the left padding.
DataLayout
[DataLayout enum definition]
Describe a multidimensional execution window.
bool is_data_type_float(DataType dt)
Check if a given data type is of floating point type.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
bool gpu_target_is_in(GPUTarget target_to_check, GPUTarget target, Args... targets)
Helper function to check whether a gpu target is equal to the provided targets.
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.
const cl::Device & get_device()
Gets the CL device for which the programs are created.
void add_option_if_else(bool cond, std::string option_true, std::string option_false)
Adds first option if condition is true else the second one.