Compute Library
 23.08
direct_convolution.cl File Reference
#include "helpers.h"
#include "helpers_asymm.h"

Go to the source code of this file.

Functions

__kernel void direct_convolution_nchw (__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_stride_z, uint src_step_z, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_stride_z, uint dst_step_z, uint dst_offset_first_element_in_bytes, __global uchar *weights_ptr, uint weights_stride_x, uint weights_step_x, uint weights_stride_y, uint weights_step_y, uint weights_stride_z, uint weights_step_z, uint weights_offset_first_element_in_bytes, __global uchar *biases_ptr, uint biases_stride_x, uint biases_step_x, uint biases_offset_first_element_in_bytes, unsigned int weights_stride_w)
 This kernel performs a direct convolution to convolve the low three dimensions. More...
 

Function Documentation

◆ direct_convolution_nchw()

__kernel void direct_convolution_nchw ( __global uchar *  src_ptr,
uint  src_stride_x,
uint  src_step_x,
uint  src_stride_y,
uint  src_step_y,
uint  src_stride_z,
uint  src_step_z,
uint  src_offset_first_element_in_bytes,
__global uchar *  dst_ptr,
uint  dst_stride_x,
uint  dst_step_x,
uint  dst_stride_y,
uint  dst_step_y,
uint  dst_stride_z,
uint  dst_step_z,
uint  dst_offset_first_element_in_bytes,
__global uchar *  weights_ptr,
uint  weights_stride_x,
uint  weights_step_x,
uint  weights_stride_y,
uint  weights_step_y,
uint  weights_stride_z,
uint  weights_step_z,
uint  weights_offset_first_element_in_bytes,
__global uchar *  biases_ptr,
uint  biases_stride_x,
uint  biases_step_x,
uint  biases_offset_first_element_in_bytes,
unsigned int  weights_stride_w 
)

This kernel performs a direct convolution to convolve the low three dimensions.

Note
The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3
The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3
Parameters
[in]src_ptrPointer to the source tensor. Supported data types: F16/F32
[in]src_stride_xStride of the source tensor in X dimension (in bytes)
[in]src_step_xsrc_stride_x * number of elements along X processed per workitem(in bytes)
[in]src_stride_yStride of the source tensor in Y dimension (in bytes)
[in]src_step_ysrc_stride_y * number of elements along Y processed per workitem(in bytes)
[in]src_stride_zStride of the source tensor in Z dimension (in bytes)
[in]src_step_zsrc_stride_z * number of elements along Z processed per workitem(in bytes)
[in]src_offset_first_element_in_bytesThe offset of the first element in the source tensor
[out]dst_ptrPointer to the destination tensor. Supported data types: same as src_ptr
[in]dst_stride_xStride of the destination tensor in X dimension (in bytes)
[in]dst_step_xdst_stride_x * number of elements along X processed per workitem(in bytes)
[in]dst_stride_yStride of the destination tensor in Y dimension (in bytes)
[in]dst_step_ydst_stride_y * number of elements along Z processed per workitem(in bytes)
[in]dst_stride_zStride of the destination tensor in Z dimension (in bytes)
[in]dst_step_zdst_stride_z * number of elements along Z processed per workitem(in bytes)
[in]dst_offset_first_element_in_bytesThe offset of the first element in the destination tensor
[in]weights_ptrPointer to the weights tensor. Supported data types: same as src_ptr
[in]weights_stride_xStride of the weights tensor in X dimension (in bytes)
[in]weights_step_xweights_stride_x * number of elements along X processed per workitem(in bytes)
[in]weights_stride_yStride of the weights tensor in Y dimension (in bytes)
[in]weights_step_yweights_stride_y * number of elements along y processed per workitem(in bytes)
[in]weights_stride_zStride of the weights tensor in Z dimension (in bytes)
[in]weights_step_zweights_stride_z * number of elements along Z processed per workitem(in bytes)
[in]weights_offset_first_element_in_bytesThe offset of the first element in the weights tensor
[in]biases_ptrPointer to the biases tensor. Same as src_ptr
[in]biases_stride_xStride of the biases tensor in X dimension (in bytes)
[in]biases_step_xbiases_stride_x * number of elements along X processed per workitem(in bytes)
[in]biases_offset_first_element_in_bytesThe offset of the first element in the biases tensor
[in]weights_stride_wStride of the weights tensor in the 4th dimension

Definition at line 69 of file direct_convolution.cl.

77 {
78  const int id0 = get_global_id(0);
79  const int id1 = get_global_id(1);
80  const int id2 = get_global_id(2);
81 
82  const int x_coords = (id0 * STRIDE_X) - PAD_LEFT;
83  const int y_coords = (id1 * STRIDE_Y) - PAD_TOP;
84 
85  const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
86 
87  __global uchar *src_addr = (__global uchar *)(src_ptr + src_offset_first_element_in_bytes);
88  __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + id2 * weights_stride_w);
89  __global uchar *dst_addr = (__global uchar *)dst_ptr + dst_offset_first_element_in_bytes + x_offs + id1 * dst_stride_y + id2 * dst_stride_z;
90 
91 #ifdef IS_QUANTIZED
92  int acc_value = 0;
93 #else /* IS_QUANTIZED */
94  DATA_TYPE acc_value = 0;
95 #endif /* IS_QUANTIZED */
96  for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
97  {
98  for(int y = 0; y < WEI_HEIGHT; ++y)
99  {
100  for(int x = 0; x < WEI_WIDTH; ++x)
101  {
102  const int idx_x = (x_coords + x);
103  const int idx_y = (y_coords + y);
104  if((idx_x >= 0 && idx_x < SRC_WIDTH) && (idx_y >= 0 && idx_y < SRC_HEIGHT))
105  {
106  const int weight_offset = x + (WEI_HEIGHT * y);
107  const int input_offset = idx_x + SRC_WIDTH * idx_y;
108 #ifdef IS_QUANTIZED
109  int weight = convert_int(*((__global DATA_TYPE *)weights_addr + weight_offset));
110  int input = convert_int(*((__global DATA_TYPE *)src_addr + input_offset));
111  acc_value += (input + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET);
112 #else /* IS_QUANTIZED */
113  DATA_TYPE weight = *((__global DATA_TYPE *)weights_addr + weight_offset);
114  DATA_TYPE input = *((__global DATA_TYPE *)src_addr + input_offset);
115  acc_value += input * weight;
116 #endif /* IS_QUANTIZED */
117  }
118  }
119  }
120  src_addr += src_stride_z;
121  weights_addr += weights_stride_z;
122  }
123 
124 #ifdef HAS_BIAS
125 
126  Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
127 #ifdef IS_QUANTIZED
128  int bias = *((__global int *)(vector_offset(&biases, id2)));
129 #else /* IS_QUANTIZED */
130  DATA_TYPE bias = *((__global DATA_TYPE *)(vector_offset(&biases, id2)));
131 #endif /* IS_QUANTIZED */
132  acc_value += bias;
133 
134 #endif /* defined(HAS_BIAS) */
135 
136 #ifdef IS_QUANTIZED
137 
138 #if OUTPUT_SHIFT < 0
139  acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
140 #else // OUTPUT_SHIFT < 0
141  acc_value = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(acc_value, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 1);
142 #endif // OUTPUT_SHIFT < 0
143  acc_value = acc_value + OUTPUT_OFFSET;
144 #endif /* IS_QUANTIZED */
145 
146  *(__global DATA_TYPE *)dst_addr = CONVERT_SAT(acc_value, DATA_TYPE);
147 }

References ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, bias, CONVERT_SAT, CONVERT_TO_VECTOR_STRUCT_NO_STEP, arm_compute::test::validation::input, VEC_SIZE, vector_offset(), and weights_stride_z.

Vector
Structure to hold Vector information.
Definition: helpers.h:917
CONVERT_SAT
#define CONVERT_SAT(x, type)
Definition: helpers.h:762
VEC_SIZE
#define VEC_SIZE
Definition: qlstm_layer_normalization.cl:54
CONVERT_TO_VECTOR_STRUCT_NO_STEP
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)
Definition: helpers.h:880
ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size)
Definition: helpers_asymm.h:399
bias
const int32_t * bias
Definition: working_space.hpp:322
vector_offset
const __global uchar * vector_offset(const Vector *vec, int x)
Get the pointer position of a Vector.
Definition: helpers.h:1101
weights_stride_z
const size_t weights_stride_z
Definition: impl.cpp:57
arm_compute::test::validation::input
auto input
Definition: LSTMLayerQuantized.cpp:486
ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size)
Definition: helpers_asymm.h:397