Compute Library
 22.05
direct_convolution.cl
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "helpers.h"
25 #include "helpers_asymm.h"
26 
27 /** This kernel performs a direct convolution to convolve the low three dimensions.
28  *
29  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
30  * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
31  * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
32  * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
33  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
34  * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
35  * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
36  * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3
37  * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3
38  *
39  * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
40  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
41  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
42  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
43  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
44  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
45  * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
46  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
47  * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
48  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
49  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
50  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
51  * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes)
52  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
53  * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
54  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
55  * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
56  * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
57  * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
58  * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
59  * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes)
60  * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
61  * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
62  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
63  * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr
64  * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
65  * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
66  * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
67  * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
68  */
72  TENSOR3D_DECLARATION(weights),
73 #ifdef HAS_BIAS
74  VECTOR_DECLARATION(biases),
75 #endif /* defined(HAS_BIAS) */
76  unsigned int weights_stride_w)
77 {
78  const int id0 = get_global_id(0);
79  const int id1 = get_global_id(1);
80  const int id2 = get_global_id(2);
81 
82  const int x_coords = (id0 * STRIDE_X) - PAD_LEFT;
83  const int y_coords = (id1 * STRIDE_Y) - PAD_TOP;
84 
85  const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
86 
87  __global uchar *src_addr = (__global uchar *)(src_ptr + src_offset_first_element_in_bytes);
88  __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + id2 * weights_stride_w);
89  __global uchar *dst_addr = (__global uchar *)dst_ptr + dst_offset_first_element_in_bytes + x_offs + id1 * dst_stride_y + id2 * dst_stride_z;
90 
91 #ifdef IS_QUANTIZED
92  int acc_value = 0;
93 #else /* IS_QUANTIZED */
94  DATA_TYPE acc_value = 0;
95 #endif /* IS_QUANTIZED */
96  for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
97  {
98  for(int y = 0; y < WEI_HEIGHT; ++y)
99  {
100  for(int x = 0; x < WEI_WIDTH; ++x)
101  {
102  const int idx_x = (x_coords + x);
103  const int idx_y = (y_coords + y);
104  if((idx_x >= 0 && idx_x < SRC_WIDTH) && (idx_y >= 0 && idx_y < SRC_HEIGHT))
105  {
106  const int weight_offset = x + (WEI_HEIGHT * y);
107  const int input_offset = idx_x + SRC_WIDTH * idx_y;
108 #ifdef IS_QUANTIZED
109  int weight = convert_int(*((__global DATA_TYPE *)weights_addr + weight_offset));
110  int input = convert_int(*((__global DATA_TYPE *)src_addr + input_offset));
111  acc_value += (input + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET);
112 #else /* IS_QUANTIZED */
113  DATA_TYPE weight = *((__global DATA_TYPE *)weights_addr + weight_offset);
114  DATA_TYPE input = *((__global DATA_TYPE *)src_addr + input_offset);
115  acc_value += input * weight;
116 #endif /* IS_QUANTIZED */
117  }
118  }
119  }
120  src_addr += src_stride_z;
121  weights_addr += weights_stride_z;
122  }
123 
124 #ifdef HAS_BIAS
125 
126  Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
127 #ifdef IS_QUANTIZED
128  int bias = *((__global int *)(vector_offset(&biases, id2)));
129 #else /* IS_QUANTIZED */
130  DATA_TYPE bias = *((__global DATA_TYPE *)(vector_offset(&biases, id2)));
131 #endif /* IS_QUANTIZED */
132  acc_value += bias;
133 
134 #endif /* defined(HAS_BIAS) */
135 
136 #ifdef IS_QUANTIZED
137 
138 #if OUTPUT_SHIFT < 0
139  acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
140 #else // OUTPUT_SHIFT < 0
141  acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
142 #endif // OUTPUT_SHIFT < 0
143  acc_value = acc_value + OUTPUT_OFFSET;
144 #endif /* IS_QUANTIZED */
145 
146  *(__global DATA_TYPE *)dst_addr = CONVERT_SAT(acc_value, DATA_TYPE);
147 }
Structure to hold Vector information.
Definition: helpers.h:888
#define VEC_SIZE
__kernel void direct_convolution_nchw(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_stride_z, uint src_step_z, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_stride_z, uint dst_step_z, uint dst_offset_first_element_in_bytes, __global uchar *weights_ptr, uint weights_stride_x, uint weights_step_x, uint weights_stride_y, uint weights_step_y, uint weights_stride_z, uint weights_step_z, uint weights_offset_first_element_in_bytes, __global uchar *biases_ptr, uint biases_stride_x, uint biases_step_x, uint biases_offset_first_element_in_bytes, unsigned int weights_stride_w)
This kernel performs a direct convolution to convolve the low three dimensions.
SimpleTensor< float > src
Definition: DFT.cpp:155
#define VECTOR_DECLARATION(name)
Definition: helpers.h:798
#define CONVERT_SAT(x, type)
Definition: helpers.h:733
__global const uchar * vector_offset(const Vector *vec, int x)
Get the pointer position of a Vector.
Definition: helpers.h:1072
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size)
const size_t weights_stride_z
Definition: impl.cpp:56
#define TENSOR3D_DECLARATION(name)
Definition: helpers.h:812
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)
Definition: helpers.h:851
const int32_t * bias