42 template <
unsigned int kernel_size>
44 : _input(nullptr), _bias(nullptr), _weights(nullptr), _output(nullptr), _border_size(0), _conv_stride_x(0), _conv_stride_y(0), _conv_pad_x(0), _conv_pad_y(0), _lws(gles::NDRange(1
U, 1
U, 1
U))
48 template <
unsigned int kernel_size>
54 template <
unsigned int kernel_size>
62 ARM_COMPUTE_ERROR_ON_MSG((kernel_size == 3 && std::get<0>(conv_info.
stride()) > 2),
"Strides larger than 2 not supported in 3x3 direct convolution!");
75 unsigned int owidth = 0;
76 unsigned int oheight = 0;
80 output_shape.
set(0, owidth);
81 output_shape.
set(1, oheight);
92 _conv_stride_x = std::get<0>(conv_info.
stride());
93 _conv_stride_y = std::get<1>(conv_info.
stride());
94 _conv_pad_x = std::get<0>(conv_info.
pad());
95 _conv_pad_y = std::get<1>(conv_info.
pad());
101 _border_size =
BorderSize(_conv_pad_y, _conv_pad_x);
103 std::set<std::string> options;
112 options.emplace((
"#define " + dt_name));
117 options.emplace(
"#define FUSED_ACTIVATION");
124 unsigned int num_elems_read_per_iteration_x = kernel_size * _conv_stride_x;
125 unsigned int num_elems_read_per_iteration_y = 1;
126 unsigned int num_elems_written_per_iteration_x = 1;
127 unsigned int num_elems_written_per_iteration_y = 1;
128 unsigned int num_elems_written_per_iteration_z = 1;
132 if((_conv_stride_x == 1) && (_conv_stride_y == 1))
138 #define PROCESS_4X_3Y_1Z 140 #if defined(PROCESS_8X_3Y_1Z) 141 options.emplace(
"#define PROCESS_8X_3Y_1Z");
142 num_elems_read_per_iteration_x = 16;
143 num_elems_read_per_iteration_y = 5;
144 num_elems_written_per_iteration_x = 8;
145 num_elems_written_per_iteration_y = 3;
146 #elif defined(PROCESS_4X_3Y_1Z) 147 options.emplace(
"#define PROCESS_4X_3Y_1Z");
148 num_elems_read_per_iteration_x = 8;
149 num_elems_read_per_iteration_y = 5;
150 num_elems_written_per_iteration_x = 4;
151 num_elems_written_per_iteration_y = 3;
152 #elif defined(PROCESS_4X_4Y_1Z) 153 options.emplace(
"#define PROCESS_4X_4Y_1Z");
154 num_elems_read_per_iteration_x = 8;
155 num_elems_read_per_iteration_y = 6;
156 num_elems_written_per_iteration_x = 4;
157 num_elems_written_per_iteration_y = 4;
158 #elif defined(PROCESS_4X_3Y_2Z) 159 options.emplace(
"#define PROCESS_4X_3Y_2Z");
160 num_elems_read_per_iteration_x = 8;
161 num_elems_read_per_iteration_y = 5;
162 num_elems_written_per_iteration_x = 4;
163 num_elems_written_per_iteration_y = 3;
164 num_elems_written_per_iteration_z = 2;
166 #undef PROCESS_8X_3Y_1Z 167 #undef PROCESS_4X_3Y_1Z 168 #undef PROCESS_4X_4Y_1Z 169 #undef PROCESS_4X_3Y_2Z 173 options.emplace(
"#define PROCESS_4X_3Y_1Z");
174 num_elems_read_per_iteration_x = 8;
175 num_elems_read_per_iteration_y = 5;
176 num_elems_written_per_iteration_x = 4;
177 num_elems_written_per_iteration_y = 3;
191 options.emplace(
"#define PROCESS_4X_1Y_1Z");
192 num_elems_read_per_iteration_x = 8;
193 num_elems_written_per_iteration_x = 4;
198 #define PROCESS_4X_1Y_1Z 200 #if defined(PROCESS_1X_1Y_1Z) 201 options.emplace(
"#define PROCESS_1X_1Y_1Z");
202 num_elems_read_per_iteration_x = 3;
203 num_elems_written_per_iteration_x = 1;
204 #elif defined(PROCESS_4X_1Y_1Z) 205 options.emplace(
"#define PROCESS_4X_1Y_1Z");
206 num_elems_read_per_iteration_x = 8;
207 num_elems_written_per_iteration_x = 4;
208 #elif defined(PROCESS_8X_1Y_1Z) 209 options.emplace(
"#define PROCESS_8X_1Y_1Z");
210 num_elems_read_per_iteration_x = 12;
211 num_elems_written_per_iteration_x = 8;
213 #error Have to declare how many elements to process in one thread. 215 #undef PROCESS_1X_1Y_1Z 216 #undef PROCESS_4X_1Y_1Z 217 #undef PROCESS_8X_1Y_1Z 226 else if(kernel_size == 1)
230 options.emplace(
"#define WEIGHTS_OPTIMIZATION");
235 #define PROCESS_8X_2Y_1Z 237 #if defined(PROCESS_4X_1Y_1Z) 238 options.emplace(
"#define PROCESS_4X_1Y_1Z");
239 num_elems_read_per_iteration_x = 4;
240 num_elems_written_per_iteration_x = 4;
241 #elif defined(PROCESS_4X_2Y_1Z) 242 options.emplace(
"#define PROCESS_4X_2Y_1Z");
243 num_elems_read_per_iteration_x = 4;
244 num_elems_read_per_iteration_y = 2;
245 num_elems_written_per_iteration_x = 4;
246 num_elems_written_per_iteration_y = 2;
247 #elif defined(PROCESS_4X_3Y_1Z) 248 options.emplace(
"#define PROCESS_4X_3Y_1Z");
249 num_elems_read_per_iteration_x = 4;
250 num_elems_read_per_iteration_y = 3;
251 num_elems_written_per_iteration_x = 4;
252 num_elems_written_per_iteration_y = 3;
253 #elif defined(PROCESS_4X_4Y_1Z) 254 options.emplace(
"#define PROCESS_4X_4Y_1Z");
255 num_elems_read_per_iteration_x = 4;
256 num_elems_read_per_iteration_y = 4;
257 num_elems_written_per_iteration_x = 4;
258 num_elems_written_per_iteration_y = 4;
259 #elif defined(PROCESS_4X_2Y_2Z) 261 options.emplace(
"#define PROCESS_4X_2Y_2Z");
262 num_elems_read_per_iteration_x = 4;
263 num_elems_read_per_iteration_y = 2;
264 num_elems_written_per_iteration_x = 4;
265 num_elems_written_per_iteration_y = 2;
266 num_elems_written_per_iteration_z = 2;
267 #elif defined(PROCESS_8X_1Y_1Z) 268 options.emplace(
"#define PROCESS_8X_1Y_1Z");
269 num_elems_read_per_iteration_x = 8;
270 num_elems_written_per_iteration_x = 8;
271 #elif defined(PROCESS_8X_2Y_1Z) 272 options.emplace(
"#define PROCESS_8X_2Y_1Z");
273 num_elems_read_per_iteration_x = 8;
274 num_elems_read_per_iteration_y = 2;
275 num_elems_written_per_iteration_x = 8;
276 num_elems_written_per_iteration_y = 2;
278 #error Have to declare how many elements to process in one thread. 280 #undef PROCESS_4X_1Y_1Z 281 #undef PROCESS_4X_2Y_1Z 282 #undef PROCESS_4X_3Y_1Z 283 #undef PROCESS_4X_4Y_1Z 284 #undef PROCESS_4X_2Y_2Z 285 #undef PROCESS_8X_1Y_1Z 286 #undef PROCESS_8X_2Y_1Z 290 num_elems_read_per_iteration_x = 1;
291 num_elems_written_per_iteration_x = 1;
298 else if(kernel_size == 5)
303 options.emplace(
"#define PROCESS_4X_1Y_1Z");
304 num_elems_read_per_iteration_x = 8;
305 num_elems_written_per_iteration_x = 4;
317 options.emplace(
"#define BIAS");
321 kernel_name <<
"direct_convolution" << kernel_size <<
"x" << kernel_size;
330 const int output_padding_right =
ceil_to_multiple(output_width, num_elems_written_per_iteration_x * _lws[0]) - output_width;
331 const int output_padding_bottom =
ceil_to_multiple(output_height, num_elems_written_per_iteration_y * _lws[1]) - output_height;
336 const int input_total_width = std::max(
int(input->
info()->
padding().
left),
int(_conv_pad_x)) + input_width + std::max(
int(input->
info()->
padding().
right),
int(_conv_pad_x));
337 const int input_total_height = std::max(
int(input->
info()->
padding().
top),
int(_conv_pad_y)) + input_height + std::max(
int(input->
info()->
padding().
bottom),
int(_conv_pad_y));
338 const int padding_right1 =
ceil_to_multiple(input_total_width, num_elems_read_per_iteration_x * _lws[0]) - input_width - _conv_pad_x;
339 const int padding_bottom1 =
ceil_to_multiple(input_total_height, num_elems_read_per_iteration_y * _lws[1]) - input_height - _conv_pad_y;
341 const int upper_bound_w =
ceil_to_multiple(((output_width + output_padding_right) * _conv_stride_x + (kernel_size - 1)), num_elems_read_per_iteration_x * _lws[0]) - _conv_pad_x -
input_width;
342 const int upper_bound_h =
ceil_to_multiple(((output_height + output_padding_bottom) * _conv_stride_y + (kernel_size - 1)), num_elems_read_per_iteration_y * _lws[1]) - _conv_pad_y -
input_height;
343 const int padding_right2 = std::max(upper_bound_w, _conv_pad_x);
344 const int padding_bottom2 = std::max(upper_bound_h, _conv_pad_y);
346 const int padding_right = std::max(padding_right1, padding_right2);
347 const int padding_bottom = std::max(padding_bottom1, padding_bottom2);
353 AccessWindowStatic input_access(input->
info(), -_conv_pad_x, -_conv_pad_y, input_width + padding_right, input_height + padding_bottom);
360 if((weights->
info()->
dimension(2) % 2 != 0) || (kernel_size != 1))
383 AccessWindowStatic output_access(output->
info(), 0, 0, output_width + output_padding_right, output_height + output_padding_bottom);
399 IGCKernel::configure(win);
402 template <
unsigned int kernel_size>
437 unsigned int idx = 0;
442 _kernel.update_shader_params();
unsigned int top
top of the border
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
const Window & window() const
The maximum window the kernel can be executed on.
void add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
void enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws=gles::NDRange(1U, 1U, 1U))
Add the kernel to the command queue with the given window.
bool enabled() const
Check if initialised.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
void shift(size_t dimension, int shift_value)
Shift the values of a given dimension by the given shift_value.
Container for 2D border size.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
constexpr int step() const
Return the step of the dimension.
float a() const
Get the alpha value.
std::string to_string(T &&value)
Convert integer and float values to string.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
const size_t input_height
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
const std::string & string_from_activation_func(ActivationLayerInfo::ActivationFunction act)
Translates a given activation function to a string.
Interface for GLES Compute tensor.
unsigned int bottom
bottom of the border
unsigned int num_arguments_per_1D_tensor() const
Returns the number of arguments enqueued per 1D tensor object.
unsigned int num_arguments_per_3D_tensor() const
Returns the number of arguments enqueued per 3D tensor object.
std::string lower_string(const std::string &val)
Convert a given string to lowercase.
Activation Layer Information class.
void use_tensor_dimensions(const TensorShape &shape, size_t first_dimension=Window::DimX)
Use the tensor's dimensions to fill the window dimensions.
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
std::pair< unsigned int, unsigned int > scaled_dimensions(int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info, const Size2D &dilation=Size2D(1U, 1U))
Returns the expected width and height of the output scaled tensor depending on the dimensions rounding mode.
Implementation of a static rectangular access pattern.
void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info=ActivationLayerInfo())
Set the input and output of the kernel.
Interface for the direct convolution kernel.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
bool padding_is_symmetric() const
Check whether the padding is symmetric.
std::string float_to_string_with_full_precision(float val)
Create a string with the float in full precision.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...)
void run(const Window &window) override
Enqueue the OpenGL ES shader to process the given window.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger than or equal to value that is a multiple of divisor.
Manages all the GLES kernels compilation and caching, provides accessors for the GLES Context.
Class to describe a number of elements in each dimension.
void set_needs_shifting(bool needs_shifting)
Set the flag indicating whether or not a tensor needs shifting.
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
BorderSize border_size() const override
The size of the border for that kernel.
std::pair< unsigned int, unsigned int > stride() const
Get the stride.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
Padding and stride information class.
virtual PaddingSize padding() const =0
Padding of tensor.
unsigned int left
left of the border
bool slide_window_slice_3D(Window &slice) const
Slide the passed 3D window slice.
unsigned int right
right of the border
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
static GCKernelLibrary & get()
Get the static instance of GCKernelLibrary.
void set_dimension_step(size_t dimension, int step)
Set the step of a given dimension.
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...)
constexpr const Dimension & y() const
Alias to access the second dimension of the window.
Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps, BorderSize border_size)
GCKernel create_kernel(const std::string &shader_name, const StringSet &build_options_set={}) const
Creates a kernel from the kernel library.
std::pair< unsigned int, unsigned int > pad() const
Get the padding.
ActivationFunction activation() const
Get the type of activation function.
float b() const
Get the beta value.
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
Container for valid region of a window.
GCDirectConvolutionLayerKernel()
Default constructor.
void adjust(size_t dimension, int adjust_value, bool is_at_start)
Adjust the start or end of a given dimension by the given value.
Window first_slice_window_3D() const
First 3D slice of the window.
Describe a multidimensional execution window.
TensorShape & set(size_t dimension, size_t value, bool apply_dim_correction=true, bool increase_dim_unit=true)
Accessor to set the value of one of the dimensions.
void add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)
constexpr const Dimension & x() const
Alias to access the first dimension of the window.