Compute Library
 21.05
direct_convolution1x1.cl
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "helpers.h"
25 
26 #undef CONVERT_SAT
27 
28 #define ADD_OP(a, b) ((a) + (b))
29 #define MUL_OP(a, b) ((a) * (b))
30 #define CONVERT_SAT(a, b) ((a))
31 
32 #if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
33 
34 #if STRIDE_X == 3
35 #define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size
36 #define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size)
37 #elif STRIDE_X == 2
38 #define INPUT_PIXEL(data_size) extract_input_stride2
39 #elif STRIDE_X == 1
40 #define INPUT_PIXEL(data_size) extract_input_stride1
41 #else /* STRIDE_X not equals 1, 2 or 3 */
42 #error "Only support strides 1, 2 and 3"
43 #endif /* STRIDE_X == 3 */
44 
45 /** Extracts a 1D horizontal vector from the input tensor with stride as 1.
46  *
47  * @param[in] input_pixel Pointer to the first pixel.
48  *
49  * @return extracted input values.
50  */
51 inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
52 {
53  return vload8(0, input_pixel);
54 }
55 
56 /** Extracts a 1D horizontal vector from the input tensor with stride as 2.
57  *
58  * @param[in] input_pixel Pointer to the first pixel.
59  *
60  * @return extracted input values.
61  */
62 inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel)
63 {
64  VEC_DATA_TYPE(DATA_TYPE, 16)
65  temp = vload16(0, input_pixel);
66  return temp.s02468ace;
67 }
68 
69 /** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size.
70  *
71  * @param[in] input_pixel Pointer to the first pixel.
72  *
73  * @return extracted input values.
74  */
75 inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel)
76 {
77  VEC_DATA_TYPE(DATA_TYPE, 4)
78  temp1 = vload4(0, input_pixel);
79  VEC_DATA_TYPE(DATA_TYPE, 4)
80  temp2 = vload4(0, input_pixel + 6);
81  VEC_DATA_TYPE(DATA_TYPE, 4)
82  temp3 = vload4(0, input_pixel + 12);
83  VEC_DATA_TYPE(DATA_TYPE, 4)
84  temp4 = vload4(0, input_pixel + 18);
85  return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
86 }
87 
88 /** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size.
89  *
90  * @param[in] input_pixel Pointer to the first pixel.
91  *
92  * @return extracted input values.
93  */
94 inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
95 {
96  VEC_DATA_TYPE(DATA_TYPE, 8)
97  temp1 = vload8(0, input_pixel);
98  VEC_DATA_TYPE(DATA_TYPE, 8)
99  temp2 = vload8(0, input_pixel + 8);
100  VEC_DATA_TYPE(DATA_TYPE, 8)
101  temp3 = vload8(0, input_pixel + 16);
102  return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
103 }
104 
105 /** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size.
106  *
107  * @param[in] input_pixel Pointer to the first pixel.
108  *
109  * @return extracted input values.
110  */
111 inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
112 {
113  VEC_DATA_TYPE(DATA_TYPE, 16)
114  temp1 = vload16(0, input_pixel);
115  VEC_DATA_TYPE(DATA_TYPE, 16)
116  temp2 = vload16(0, input_pixel + 12);
117  return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
118 }
119 
120 /** This kernel performs a direct convolution to convolve the low three dimensions.
121  *
122  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
123  * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
124  * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
125  * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
126  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
127  *
128  * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
129  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
130  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
131  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
132  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
133  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
134  * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
135  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
136  * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
137  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
138  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
139  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
140  * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
141  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
142  * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
143  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
144  * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
145  * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
146  * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
147  * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
148  * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
149  * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
150  * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
151  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
152  * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
153  * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
154  * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
155  * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
156  * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
157  */
158 __kernel void direct_convolution1x1(
161  TENSOR3D_DECLARATION(weights),
162 #ifdef HAS_BIAS
163  VECTOR_DECLARATION(biases),
164 #endif /* defined(HAS_BIAS) */
165  unsigned int weights_stride_w)
166 {
168  Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
170 
171 #ifdef HAS_BIAS
172  Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
173 #endif /* defined(HAS_BIAS) */
174 
175  VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
176  values = 0;
177 
178  const uint z_index = get_global_id(2);
179 
180  weights.ptr += z_index * weights_stride_w;
181  for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
182  {
183  DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
184  VEC_DATA_TYPE(DATA_TYPE, 8)
185  input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
186  values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));
187  src.ptr += src_stride_z;
188  weights.ptr += weights_stride_z;
189  }
190 
191 #ifdef HAS_BIAS
192  values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));
193 #endif /* defined(HAS_BIAS) */
194 
195  vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
196 }
197 #endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
198 
199 #if defined(WEIGHTS_DEPTH)
200 
201 #define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \
202  ({ \
203  acc.s0 = mad(src.s0, weight_value, acc.s0); \
204  acc.s1 = mad(src.s1, weight_value, acc.s1); \
205  acc.s2 = mad(src.s2, weight_value, acc.s2); \
206  acc.s3 = mad(src.s3, weight_value, acc.s3); \
207  })
208 
209 /** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32
210  *
211  * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
212  * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
213  * @note In case biases, -DHAS_BIAS must to be passed at compile
214  *
215  * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
216  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
217  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
218  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
219  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
220  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
221  * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
222  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
223  * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
224  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
225  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
226  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
227  * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
228  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
229  * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
230  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
231  * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
232  * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
233  * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
234  * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
235  * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
236  * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
237  * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
238  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
239  * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
240  * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
241  * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
242  * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
243  * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
244  */
245 __kernel void direct_convolution1x1_f32_bifrost(
248  TENSOR3D_DECLARATION(weights),
249 #ifdef HAS_BIAS
250  VECTOR_DECLARATION(biases),
251 #endif /* defined(HAS_BIAS) */
252  unsigned int weights_stride_w)
253 {
254  // Get the kernel index
255  const int kernel_index = get_global_id(2);
256 
259 
260  float4 acc0 = 0.0f;
261  float4 acc1 = 0.0f;
262  float4 acc2 = 0.0f;
263  float4 acc3 = 0.0f;
264 
265  __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
266  __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);
267 
268  for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
269  {
270  // Load the weights
271  float weight = *((__global float *)weights_addr);
272 
273  // Load values from row0 of input tensor
274  float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y));
275  float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y));
276  float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y));
277  float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y));
278 
279  CONVOLUTION1x1_BIFROST(acc0, src0, weight);
280  CONVOLUTION1x1_BIFROST(acc1, src1, weight);
281  CONVOLUTION1x1_BIFROST(acc2, src2, weight);
282  CONVOLUTION1x1_BIFROST(acc3, src3, weight);
283 
284  src_addr += src_stride_z;
285  weights_addr += weights_stride_z;
286  }
287 
288 #ifdef HAS_BIAS
289  Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
290 
291  float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index)));
292 
293  acc0.s0 += bias;
294  acc0.s1 += bias;
295  acc0.s2 += bias;
296  acc0.s3 += bias;
297  acc1.s0 += bias;
298  acc1.s1 += bias;
299  acc1.s2 += bias;
300  acc1.s3 += bias;
301  acc2.s0 += bias;
302  acc2.s1 += bias;
303  acc2.s2 += bias;
304  acc2.s3 += bias;
305  acc3.s0 += bias;
306  acc3.s1 += bias;
307  acc3.s2 += bias;
308  acc3.s3 += bias;
309 #endif /* defined(HAS_BIAS) */
310 
311  vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
312  vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
313  vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
314  vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
315 }
316 #endif // defined(WEIGHTS_DEPTH)
Structure to hold Vector information.
Definition: helpers.h:666
const size_t weights_stride_z
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:861
#define CONVERT_TO_IMAGE_STRUCT(name)
Definition: helpers.h:632
#define CONVERT_SAT(a, b)
for(size_t k=0;k< _target.size();++k)
Definition: Unstack.cpp:91
Structure to hold 3D tensor information.
Definition: helpers.h:683
SimpleTensor< float > src
Definition: DFT.cpp:155
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)
Definition: helpers.h:651
#define ADD_OP(a, b)
#define VECTOR_DECLARATION(name)
Definition: helpers.h:590
Structure to hold Image information.
Definition: helpers.h:674
#define MUL_OP(a, b)
#define CONVERT_TO_TENSOR3D_STRUCT(name)
Definition: helpers.h:647
__global const uchar * vector_offset(const Vector *vec, int x)
Get the pointer position of a Vector.
Definition: helpers.h:850
__global uchar * ptr
Pointer to the starting postion of the buffer.
Definition: helpers.h:685
#define TENSOR3D_DECLARATION(name)
Definition: helpers.h:604
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)
Definition: helpers.h:629
#define VEC_DATA_TYPE(type, size)
Definition: helpers.h:519