Arm Compute Library 23.08 — im2col.cl (OpenCL im2col kernels, NCHW data layout)
/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24 #include "helpers.h"
25 #if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
26 
27 #if ELEMENT_SIZE == 1
28 #define COND_DATA_TYPE char
29 #elif ELEMENT_SIZE == 2
30 #define COND_DATA_TYPE short
31 #elif ELEMENT_SIZE == 4
32 #define COND_DATA_TYPE int
33 #else // ELEMENT_SIZE
34 #error "Element size not support"
35 #endif // ELEMENT_SIZE
36 
37 #if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
38 /** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW
39  *
40  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
41  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
42  * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
43  * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
44  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
45  * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
46  *
47  * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
48  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
49  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
50  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
51  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
52  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
53  * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
54  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
55  * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
56  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
57  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
58  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
59  * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
60  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
61  * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
62  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
63  * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
64  * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
65  */
66 __kernel void im2col1x1_stridex1_nchw(
68 #if defined(NUM_GROUPS)
70 #else // defined(NUM_GROUPS)
72 #endif // defined(NUM_GROUPS)
73  uint src_stride_w,
74  uint dst_stride_w)
75 {
76  const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor
77  const uint yc = get_global_id(1); // y coordinate in the convolved tensor
78  const uint ch = get_global_id(2) % SRC_DEPTH; // input feature map
79  const uint batch = get_global_id(2) / SRC_DEPTH; // batch size
80 
81  // Clamp xc
82  // The strategy clamps at "xc" as it will be a valid value for sure
83  uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3);
84 
85  // Check which values are valid
86  const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
87 
88  xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0));
89 
90  // Calculate input indices
91  const uint xi = xc;
92  const uint yi = yc * STRIDE_Y;
93 
94  // Calculate output indices
95 
96 #if defined(NUM_GROUPS)
97  const uint xo = ch % (SRC_DEPTH / NUM_GROUPS);
98  const uint zo = ch / (SRC_DEPTH / NUM_GROUPS);
99 #else // defined(NUM_GROUPS)
100  const uint xo = ch;
101 #endif // defined(NUM_GROUPS)
102  const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution
103 
104  // Get input and output address
105  __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
106 #if defined(NUM_GROUPS)
107  __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w;
108 #else // defined(NUM_GROUPS)
109  __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w;
110 #endif // defined(NUM_GROUPS)
111 
112  VEC_DATA_TYPE(DATA_TYPE, 4)
113  data = vload4(0, (__global DATA_TYPE *)input_ptr);
114 
115  // If out-of-bound, overwrite with the first element
116  data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0);
117 
118  *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0;
119  *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1;
120  *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2;
121  *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3;
122 
123 #ifdef HAS_BIAS
124 #if defined(NUM_GROUPS)
125  if(xo == (SRC_DEPTH / NUM_GROUPS - 1))
126 #else // defined(NUM_GROUPS)
127  if(ch == (SRC_DEPTH - 1))
128 #endif // defined(NUM_GROUPS)
129  {
130  *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f;
131  *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f;
132  *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f;
133  *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f;
134  }
135 #endif // HAS_BIAS
136 }
137 #endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
138 
139 #if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
140 #if defined(DILATION_X) && defined(DILATION_Y)
141 /** This opencl kernel performs a generic im2col implementation when the data layout is NCHW
142  *
143  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
144  * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
145  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
146  * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
147  * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
148  * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
149  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
150  * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
151  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
152  * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
153  *
154  * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
155  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
156  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
157  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
158  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
159  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
160  * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
161  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
162  * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
163  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
164  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
165  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
166  * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
167  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
168  * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
169  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
170  * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
171  * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
172  */
173 __kernel void im2col_generic_nchw(
175 #if defined(NUM_GROUPS)
177 #else // defined(NUM_GROUPS)
179 #endif // defined(NUM_GROUPS)
180  uint src_stride_w,
181  uint dst_stride_w)
182 {
183  const int xc = get_global_id(0); // x coordinate in the convolved tensor
184  const int yc = get_global_id(1); // y coordinate in the convolved tensor
185  const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
186  const int batch = get_global_id(2) / SRC_DEPTH; // batch size
187 
188  // Calculate input indices
189  const int xi = xc * STRIDE_X - PAD_LEFT;
190  const int yi = yc * STRIDE_Y - PAD_TOP;
191 
192  // Calculate output indices
193 #if defined(NUM_GROUPS)
194  const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
195  const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
196 #else // defined(NUM_GROUPS)
197  const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
198 #endif // defined(NUM_GROUPS)
199  const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
200 
201  __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
202 #if defined(NUM_GROUPS)
203  __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
204 #else // defined(NUM_GROUPS)
205  __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
206 #endif // defined(NUM_GROUPS)
207 
208  // Linearize convolution elements
209  for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
210  {
211  int y = yi + yk * DILATION_Y;
212  for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
213  {
214  int x = xi + xk * DILATION_X;
215 #if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
216  *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
217 #else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
218  if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
219  {
220  *output_ptr = PAD_VALUE;
221  }
222  else
223  {
224  *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
225  }
226 #endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
227  }
228  }
229 
230 #ifdef HAS_BIAS
231 #if defined(NUM_GROUPS)
232  if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
233 #else // defined(NUM_GROUPS)
234  if(ch == (SRC_DEPTH - 1))
235 #endif // defined(NUM_GROUPS)
236  {
237  *output_ptr = 1.0f;
238  }
239 #endif // HAS_BIAS
240 }
241 #endif // defined(DILATION_X) && defined(DILATION_Y)
242 
243 /** This opencl kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW
244  *
245  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
246  * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
247  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
248  * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
249  * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
250  * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
251  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
252  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
253  *
254  * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
255  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
256  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
257  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
258  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
259  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
260  * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
261  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
262  * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
263  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
264  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
265  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
266  * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
267  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
268  * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
269  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
270  * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
271  * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
272  */
273 __kernel void im2col3x3_nchw(
275 #if defined(NUM_GROUPS)
277 #else // defined(NUM_GROUPS)
279 #endif // defined(NUM_GROUPS)
280  uint src_stride_w,
281  uint dst_stride_w)
282 {
283  const int xc = get_global_id(0); // x coordinate in the convolved tensor
284  const int yc = get_global_id(1); // y coordinate in the convolved tensor
285  const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
286  const int batch = get_global_id(2) / SRC_DEPTH; // batch size
287 
288  // Calculate input indices
289  const int xi = xc * STRIDE_X - PAD_LEFT;
290  const int yi = yc * STRIDE_Y - PAD_TOP;
291 
292  // Calculate output indices
293 #if defined(NUM_GROUPS)
294  const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3
295  const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
296 #else // defined(NUM_GROUPS)
297  const int xo = ch * 9; // 3x3
298 #endif // defined(NUM_GROUPS)
299  const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
300 
301  // Get input and output address
302  __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
303 #if defined(NUM_GROUPS)
304  __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
305 #else // defined(NUM_GROUPS)
306  __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
307 #endif // defined(NUM_GROUPS)
308 
309  VEC_DATA_TYPE(DATA_TYPE, 3)
310  row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
311  VEC_DATA_TYPE(DATA_TYPE, 3)
312  row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
313  VEC_DATA_TYPE(DATA_TYPE, 3)
314  row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
315 
316 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
317  // Put 0 if the value is out-of-bound
318  int3 x = (int3)xi + (int3)(0, 1, 2);
319  int3 y = (int3)yi + (int3)(0, 1, 2);
320 
321  VEC_DATA_TYPE(COND_DATA_TYPE, 3)
322  cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
323  VEC_DATA_TYPE(COND_DATA_TYPE, 3)
324  cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
325  VEC_DATA_TYPE(COND_DATA_TYPE, 3)
326  cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3));
327 
328  row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0);
329  row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1);
330  row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2);
331 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
332 
333  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr);
334  *((__global DATA_TYPE *)output_ptr + 8) = row2.s2;
335 
336 #ifdef HAS_BIAS
337 #if defined(NUM_GROUPS)
338  if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1))
339 #else // defined(NUM_GROUPS)
340  if(ch == (SRC_DEPTH - 1))
341 #endif // defined(NUM_GROUPS)
342  {
343  *((__global DATA_TYPE *)output_ptr + 9) = 1.0f;
344  }
345 #endif // HAS_BIAS
346 }
347 
348 /** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW
349  *
350  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
351  * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
352  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
353  * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
354  * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
355  * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
356  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
357  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
358  * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
359  *
360  * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
361  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
362  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
363  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
364  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
365  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
366  * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
367  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
368  * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
369  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
370  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
371  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
372  * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
373  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
374  * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
375  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
376  * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
377  * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
378  */
379 __kernel void im2col5x5_nchw(
381 #if defined(NUM_GROUPS)
383 #else // defined(NUM_GROUPS)
385 #endif // defined(NUM_GROUPS)
386  uint src_stride_w,
387  uint dst_stride_w)
388 {
389  const int xc = get_global_id(0); // x coordinate in the convolved tensor
390  const int yc = get_global_id(1); // y coordinate in the convolved tensor
391  const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
392  const int batch = get_global_id(2) / SRC_DEPTH; // batch size
393 
394  // Calculate input indices
395  const int xi = xc * STRIDE_X - PAD_LEFT;
396  const int yi = yc * STRIDE_Y - PAD_TOP;
397 
398  // Calculate output indices
399 #if defined(NUM_GROUPS)
400  const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5
401  const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
402 #else // defined(NUM_GROUPS)
403  const int xo = ch * 25; // 5x5
404 #endif // defined(NUM_GROUPS)
405  const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
406 
407 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
408  // Put 0 if the value is out-of-bound
409  int4 x0 = (int4)xi + (int4)(0, 1, 2, 3);
410  int4 y0 = (int4)yi + (int4)(0, 1, 2, 3);
411  int x1 = xi + 4;
412  int y1 = yi + 4;
413 
414  // Check if we could have out-of-bounds elements in the x direction
415  VEC_DATA_TYPE(COND_DATA_TYPE, 4)
416  x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
417  VEC_DATA_TYPE(COND_DATA_TYPE, 4)
418  y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4));
419  COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH);
420  COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT);
421 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
422 
423  // Get input and output address
424  __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w;
425 #if defined(NUM_GROUPS)
426  __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
427 #else // defined(NUM_GROUPS)
428  __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
429 #endif // defined(NUM_GROUPS)
430 
431  {
432  VEC_DATA_TYPE(DATA_TYPE, 4)
433  row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
434  DATA_TYPE
435  row01 = *((__global DATA_TYPE *)input_ptr + 4);
436 
437  input_ptr += src_stride_y;
438 
439  VEC_DATA_TYPE(DATA_TYPE, 4)
440  row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
441  DATA_TYPE
442  row11 = *((__global DATA_TYPE *)input_ptr + 4);
443 
444 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
445  VEC_DATA_TYPE(COND_DATA_TYPE, 4)
446  cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0;
447  VEC_DATA_TYPE(COND_DATA_TYPE, 4)
448  cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1;
449  COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0);
450  COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1);
451 
452  // Replace with 0 if the value is not valid
453  row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
454  row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
455  row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
456  row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
457 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
458 
459  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
460  row10.s012),
461  0, (__global DATA_TYPE *)output_ptr);
462  vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
463 
464  input_ptr += src_stride_y;
465  output_ptr += 10 * dst_stride_x;
466  }
467 
468  {
469  VEC_DATA_TYPE(DATA_TYPE, 4)
470  row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
471  DATA_TYPE
472  row01 = *((__global DATA_TYPE *)input_ptr + 4);
473 
474  input_ptr += src_stride_y;
475 
476  VEC_DATA_TYPE(DATA_TYPE, 4)
477  row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
478  DATA_TYPE
479  row11 = *((__global DATA_TYPE *)input_ptr + 4);
480 
481 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
482  VEC_DATA_TYPE(COND_DATA_TYPE, 4)
483  cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;
484  VEC_DATA_TYPE(COND_DATA_TYPE, 4)
485  cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;
486  COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);
487  COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);
488 
489  // Replace with 0 if the value is not valid
490  row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
491  row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
492  row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
493  row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
494 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
495 
496  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
497  row10.s012),
498  0, (__global DATA_TYPE *)output_ptr);
499  vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
500 
501  input_ptr += src_stride_y;
502  output_ptr += 10 * dst_stride_x;
503  }
504 
505  {
506  VEC_DATA_TYPE(DATA_TYPE, 4)
507  row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
508  DATA_TYPE
509  row01 = *((__global DATA_TYPE *)input_ptr + 4);
510 
511  input_ptr += src_stride_y;
512 
513 #if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
514  VEC_DATA_TYPE(COND_DATA_TYPE, 4)
515  cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;
516  COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);
517 
518  // Replace with 0 if the value is not valid
519  row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
520  row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
521 #endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
522 
523  vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);
524  *((__global DATA_TYPE *)output_ptr + 4) = row01;
525 
526  output_ptr += 5 * dst_stride_x;
527  }
528 
529 #ifdef HAS_BIAS
530 #if defined(NUM_GROUPS)
531  if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1))
532 #else // defined(NUM_GROUPS)
533  if(ch == (SRC_DEPTH - 1))
534 #endif // defined(NUM_GROUPS)
535  {
536  *((__global DATA_TYPE *)output_ptr) = 1.0f;
537  }
538 #endif // HAS_BIAS
539 }
540 #endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
541 
542 #if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
543 /** This opencl kernel performs im2col when the kernel size is 11x11, we do not have paddings and the data layout is NCHW
544  *
545  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
546  * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
547  * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
548  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
549  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
550  * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
551  *
552  * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
553  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
554  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
555  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
556  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
557  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
558  * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
559  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
560  * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
561  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
562  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
563  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
564  * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
565  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
566  * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
567  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
568  * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
569  * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
570  */
571 __kernel void im2col11x11_padx0_pady0_nchw(
573 #if defined(NUM_GROUPS)
575 #else // defined(NUM_GROUPS)
577 #endif // defined(NUM_GROUPS)
578  uint src_stride_w,
579  uint dst_stride_w)
580 {
     /* One work-item linearizes the 11x11 receptive field of one input channel
      * (for one output position and one batch) into a 121-element run of the
      * im2col output row. */
581  const int xc = get_global_id(0); // x coordinate in the convolved tensor
582  const int yc = get_global_id(1); // y coordinate in the convolved tensor
583  const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
584  const int batch = get_global_id(2) / SRC_DEPTH; // batch size
585 
586  // Calculate input indices
587  const int xi = xc * STRIDE_X;
588  const int yi = yc * STRIDE_Y;
589 
590  // Calculate output indices
591 #if defined(NUM_GROUPS)
     // With grouping, the channel offset is relative to the channels of this
     // group (x axis) and the group index selects the output z-plane.
592  const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11
593  const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
594 #else // defined(NUM_GROUPS)
595  const int xo = ch * 121; // 11x11
596 #endif // defined(NUM_GROUPS)
597  const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
598 
599  // Get input and output address
600  __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w;
601 #if defined(NUM_GROUPS)
602  __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w;
603 #else // defined(NUM_GROUPS)
604  __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w;
605 #endif // defined(NUM_GROUPS)
606 
     /* The 11 blocks below are a manually unrolled loop over the 11 kernel
      * rows. Each block copies 11 contiguous input elements (vload8 + vload3)
      * into the output, then steps the input pointer down one input row and the
      * output pointer forward by 11 elements.
      * NOTE(review): the output advance uses src_stride_x rather than
      * dst_stride_x; this is only valid because src and dst share the same
      * element size and both are contiguous along x — confirm host side. */
     /* Kernel row 1 */
607  {
608  VEC_DATA_TYPE(DATA_TYPE, 8)
609  row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
610  VEC_DATA_TYPE(DATA_TYPE, 3)
611  row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
612 
613  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
614  vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
615 
616  input_ptr += src_stride_y;
617  output_ptr += 11 * src_stride_x;
618  }
619 
     /* Kernel row 2 */
620  {
621  VEC_DATA_TYPE(DATA_TYPE, 8)
622  row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
623  VEC_DATA_TYPE(DATA_TYPE, 3)
624  row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
625 
626  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
627  vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
628 
629  input_ptr += src_stride_y;
630  output_ptr += 11 * src_stride_x;
631  }
632 
     /* Kernel row 3 */
633  {
634  VEC_DATA_TYPE(DATA_TYPE, 8)
635  row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
636  VEC_DATA_TYPE(DATA_TYPE, 3)
637  row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
638 
639  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
640  vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
641 
642  input_ptr += src_stride_y;
643  output_ptr += 11 * src_stride_x;
644  }
645 
     /* Kernel row 4 */
646  {
647  VEC_DATA_TYPE(DATA_TYPE, 8)
648  row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
649  VEC_DATA_TYPE(DATA_TYPE, 3)
650  row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
651 
652  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
653  vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
654 
655  input_ptr += src_stride_y;
656  output_ptr += 11 * src_stride_x;
657  }
658 
     /* Kernel row 5 */
659  {
660  VEC_DATA_TYPE(DATA_TYPE, 8)
661  row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
662  VEC_DATA_TYPE(DATA_TYPE, 3)
663  row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
664 
665  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
666  vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
667 
668  input_ptr += src_stride_y;
669  output_ptr += 11 * src_stride_x;
670  }
671 
     /* Kernel row 6 */
672  {
673  VEC_DATA_TYPE(DATA_TYPE, 8)
674  row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
675  VEC_DATA_TYPE(DATA_TYPE, 3)
676  row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
677 
678  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
679  vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
680 
681  input_ptr += src_stride_y;
682  output_ptr += 11 * src_stride_x;
683  }
684 
     /* Kernel row 7 */
685  {
686  VEC_DATA_TYPE(DATA_TYPE, 8)
687  row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
688  VEC_DATA_TYPE(DATA_TYPE, 3)
689  row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
690 
691  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
692  vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
693 
694  input_ptr += src_stride_y;
695  output_ptr += 11 * src_stride_x;
696  }
697 
     /* Kernel row 8 */
698  {
699  VEC_DATA_TYPE(DATA_TYPE, 8)
700  row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
701  VEC_DATA_TYPE(DATA_TYPE, 3)
702  row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
703 
704  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
705  vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
706 
707  input_ptr += src_stride_y;
708  output_ptr += 11 * src_stride_x;
709  }
710 
     /* Kernel row 9 */
711  {
712  VEC_DATA_TYPE(DATA_TYPE, 8)
713  row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
714  VEC_DATA_TYPE(DATA_TYPE, 3)
715  row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
716 
717  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
718  vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
719 
720  input_ptr += src_stride_y;
721  output_ptr += 11 * src_stride_x;
722  }
723 
     /* Kernel row 10 */
724  {
725  VEC_DATA_TYPE(DATA_TYPE, 8)
726  row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
727  VEC_DATA_TYPE(DATA_TYPE, 3)
728  row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
729 
730  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
731  vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
732 
733  input_ptr += src_stride_y;
734  output_ptr += 11 * src_stride_x;
735  }
736 
     /* Kernel row 11 — last row, so input_ptr no longer needs to advance; the
      * output pointer is still advanced so it lands just past the 121 elements
      * (where the optional bias 1 is written below). */
737  {
738  VEC_DATA_TYPE(DATA_TYPE, 8)
739  row00 = vload8(0, (__global DATA_TYPE *)(input_ptr));
740  VEC_DATA_TYPE(DATA_TYPE, 3)
741  row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8);
742 
743  vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr);
744  vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8);
745 
746  output_ptr += 11 * src_stride_x;
747  }
748 
749 #ifdef HAS_BIAS
     /* Append the bias multiplicand: only the work-item that handled the last
      * input channel (of its group) writes the trailing 1 after its 121
      * elements, completing the im2col row. The 1.0f is converted to DATA_TYPE
      * on store. */
750 #if defined(NUM_GROUPS)
751  if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1))
752 #else // defined(NUM_GROUPS)
753  if(ch == (SRC_DEPTH - 1))
754 #endif // defined(NUM_GROUPS)
755  {
756  *((__global DATA_TYPE *)output_ptr) = 1.0f;
757  }
758 #endif // HAS_BIAS
759 }
760 #endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
761 
762 #if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
763 /** This OpenCL kernel performs im2col when the kernel size is greater than 1x1, there is no padding and the data layout is NCHW
764  *
765  * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
766  * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
767  * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
768  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
769  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
770  * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
771  *
772  * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
773  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
774  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
775  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
776  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
777  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
778  * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
779  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
780  * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
781  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
782  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
783  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
784  * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
785  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
786  * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
787  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
788  * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
789  * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
790  */
791 __kernel void im2col_generic_padx0_pady0_nchw(
793 #if defined(NUM_GROUPS)
795 #else // defined(NUM_GROUPS)
797 #endif // defined(NUM_GROUPS)
798  uint src_stride_w,
799  uint dst_stride_w)
800 {
     /* One work-item linearizes the KERNEL_WIDTH x KERNEL_HEIGHT receptive
      * field of one input channel (for one output position and one batch) into
      * a contiguous run of the im2col output row, copying VECTOR_SIZE elements
      * at a time with a WIDTH_MOD_VECTOR_SIZE tail per kernel row. */
801  const int xc = get_global_id(0); // x coordinate in the convolved tensor
802  const int yc = get_global_id(1); // y coordinate in the convolved tensor
803  const int ch = get_global_id(2) % SRC_DEPTH; // input feature map
804  const int batch = get_global_id(2) / SRC_DEPTH; // batch size
805 
806  // Calculate input indices
807  const int xi = xc * STRIDE_X;
808  const int yi = yc * STRIDE_Y;
809 
810  // Calculate output indices
811 #if defined(NUM_GROUPS)
     // With grouping, the channel offset is relative to the channels of this
     // group (x axis) and the group index selects the output z-plane.
812  const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT;
813  const int zo = ch / (SRC_DEPTH / NUM_GROUPS);
814 #else // defined(NUM_GROUPS)
815  const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT;
816 #endif // defined(NUM_GROUPS)
817  const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution
818 
     // input_ptr stays byte-typed (x/y offsets are applied per access below);
     // output_ptr is element-typed so it can advance by element counts.
819  __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w;
820 #if defined(NUM_GROUPS)
821  __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo;
822 #else // defined(NUM_GROUPS)
823  __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
824 #endif // defined(NUM_GROUPS)
825 
826  // Linearize convolution elements
827  for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
828  {
     // last_x records the x of the final full VECTOR_SIZE chunk so the tail
     // copy below can start at last_x + VECTOR_SIZE.
829  int last_x = 0;
830  for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE)
831  {
832  VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
833  row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y))
834  VSTORE(VECTOR_SIZE)
835  (row, 0, output_ptr);
836  last_x = x;
837  }
838  // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE).
839  // Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit.
     // NOTE(review): the tail address last_x + VECTOR_SIZE assumes the vector
     // loop ran at least once, i.e. KERNEL_WIDTH >= VECTOR_SIZE — presumably
     // guaranteed by the host-side kernel selection; confirm against caller.
840 #if WIDTH_MOD_VECTOR_SIZE == 1
841  *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
842 #elif WIDTH_MOD_VECTOR_SIZE > 1
843  VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
844  row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
845  VSTORE(WIDTH_MOD_VECTOR_SIZE)
846  (row, 0, output_ptr);
847 #endif /* WIDTH_MOD_VECTOR_SIZE */
848  output_ptr += WIDTH_MOD_VECTOR_SIZE;
849  } /* End of loop over KERNEL_HEIGHT */
850 
851 #ifdef HAS_BIAS
     /* Append the bias multiplicand: only the work-item that handled the last
      * input channel (of its group) writes the trailing 1, completing the
      * im2col row. The 1.0f is converted to DATA_TYPE on store. */
852 #if defined(NUM_GROUPS)
853  if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
854 #else // defined(NUM_GROUPS)
855  if(ch == (SRC_DEPTH - 1))
856 #endif // defined(NUM_GROUPS)
857  {
858  *output_ptr = 1.0f;
859  }
860 #endif // HAS_BIAS
861 }
862 #endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
863 #endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
arm_compute::test::validation::src
SimpleTensor< float > src
Definition: DFT.cpp:155
arm_compute::test::validation::dst
auto dst
Definition: DFT.cpp:170
VEC_DATA_TYPE
#define VEC_DATA_TYPE(type, size)
Definition: helpers.h:756
VLOAD
#define VLOAD(size)
Definition: helpers.h:204
VSTORE
#define VSTORE(size)
Definition: helpers.h:486
TENSOR3D_DECLARATION
#define TENSOR3D_DECLARATION(name)
Definition: helpers.h:841
arm_compute::test::validation::batch
const unsigned int batch
Definition: GEMMMatrixMultiplyNative.cpp:362
arm_compute::test::validation::reference::select
SimpleTensor< T > select(const SimpleTensor< uint8_t > &c, const SimpleTensor< T > &x, const SimpleTensor< T > &y)
Definition: Select.cpp:38
CONVERT
#define CONVERT(x, type)
Definition: helpers.h:759
IMAGE_DECLARATION
#define IMAGE_DECLARATION(name)
Definition: helpers.h:833
helpers.h