76 unsigned int weights_stride_w)
78 const int id0 = get_global_id(0);
79 const int id1 = get_global_id(1);
80 const int id2 = get_global_id(2);
82 const int x_coords = (id0 * STRIDE_X) - PAD_LEFT;
83 const int y_coords = (id1 * STRIDE_Y) - PAD_TOP;
85 const int x_offs = max((
int)(get_global_id(0) *
VEC_SIZE - (
VEC_SIZE - VEC_SIZE_LEFTOVER) %
VEC_SIZE), 0) *
sizeof(DATA_TYPE);
87 __global uchar *src_addr = (__global uchar *)(src_ptr + src_offset_first_element_in_bytes);
88 __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + id2 * weights_stride_w);
89 __global uchar *dst_addr = (__global uchar *)dst_ptr + dst_offset_first_element_in_bytes + x_offs + id1 * dst_stride_y + id2 * dst_stride_z;
94 DATA_TYPE acc_value = 0;
96 for(
volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
98 for(
int y = 0; y < WEI_HEIGHT; ++y)
100 for(
int x = 0; x < WEI_WIDTH; ++x)
102 const int idx_x = (x_coords + x);
103 const int idx_y = (y_coords + y);
104 if((idx_x >= 0 && idx_x < SRC_WIDTH) && (idx_y >= 0 && idx_y < SRC_HEIGHT))
106 const int weight_offset = x + (WEI_HEIGHT * y);
107 const int input_offset = idx_x + SRC_WIDTH * idx_y;
109 int weight = convert_int(*((__global DATA_TYPE *)weights_addr + weight_offset));
110 int input = convert_int(*((__global DATA_TYPE *)src_addr + input_offset));
111 acc_value += (input + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET);
113 DATA_TYPE weight = *((__global DATA_TYPE *)weights_addr + weight_offset);
114 DATA_TYPE input = *((__global DATA_TYPE *)src_addr + input_offset);
115 acc_value += input * weight;
120 src_addr += src_stride_z;
130 DATA_TYPE bias = *((__global DATA_TYPE *)(
vector_offset(&biases, id2)));
140 #else // OUTPUT_SHIFT < 0 142 #endif // OUTPUT_SHIFT < 0 143 acc_value = acc_value + OUTPUT_OFFSET;
146 *(__global DATA_TYPE *)dst_addr =
CONVERT_SAT(acc_value, DATA_TYPE);
Structure to hold Vector information.
__kernel void direct_convolution_nchw(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_stride_z, uint src_step_z, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_stride_z, uint dst_step_z, uint dst_offset_first_element_in_bytes, __global uchar *weights_ptr, uint weights_stride_x, uint weights_step_x, uint weights_stride_y, uint weights_step_y, uint weights_stride_z, uint weights_step_z, uint weights_offset_first_element_in_bytes, __global uchar *biases_ptr, uint biases_stride_x, uint biases_step_x, uint biases_offset_first_element_in_bytes, unsigned int weights_stride_w)
This kernel performs a direct convolution to convolve the low three dimensions.
SimpleTensor< float > src
#define VECTOR_DECLARATION(name)
#define CONVERT_SAT(x, type)
__global const uchar * vector_offset(const Vector *vec, int x)
Get the pointer position of a Vector.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size)
const size_t weights_stride_z
#define TENSOR3D_DECLARATION(name)
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)