28 #define ADD_OP(a, b) ((a) + (b)) 29 #define MUL_OP(a, b) ((a) * (b)) 30 #define CONVERT_SAT(a, b) ((a)) 32 #if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) 35 #define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size 36 #define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size) 38 #define INPUT_PIXEL(data_size) extract_input_stride2 40 #define INPUT_PIXEL(data_size) extract_input_stride1 42 #error "Only support strides 1, 2 and 3" 51 inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel)
53 return vload8(0, input_pixel);
62 inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global
const DATA_TYPE *input_pixel)
65 temp = vload16(0, input_pixel);
66 return temp.s02468ace;
75 inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global
const DATA_TYPE *input_pixel)
78 temp1 = vload4(0, input_pixel);
80 temp2 = vload4(0, input_pixel + 6);
82 temp3 = vload4(0, input_pixel + 12);
84 temp4 = vload4(0, input_pixel + 18);
85 return (
VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03);
94 inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global
const DATA_TYPE *input_pixel)
97 temp1 = vload8(0, input_pixel);
99 temp2 = vload8(0, input_pixel + 8);
101 temp3 = vload8(0, input_pixel + 16);
102 return (
VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
111 inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global
const DATA_TYPE *input_pixel)
114 temp1 = vload16(0, input_pixel);
116 temp2 = vload16(0, input_pixel + 12);
117 return (
VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
158 __kernel
void direct_convolution1x1(
165 unsigned int weights_stride_w)
178 const uint z_index = get_global_id(2);
180 weights.
ptr += z_index * weights_stride_w;
181 for(
volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
183 DATA_TYPE weight = *(__global DATA_TYPE *)weights.
ptr;
185 input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.
ptr);
187 src.
ptr += src_stride_z;
197 #endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) 199 #if defined(WEIGHTS_DEPTH) 201 #define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \ 203 acc.s0 = mad(src.s0, weight_value, acc.s0); \ 204 acc.s1 = mad(src.s1, weight_value, acc.s1); \ 205 acc.s2 = mad(src.s2, weight_value, acc.s2); \ 206 acc.s3 = mad(src.s3, weight_value, acc.s3); \ 245 __kernel
void direct_convolution1x1_f32_bifrost(
252 unsigned int weights_stride_w)
255 const int kernel_index = get_global_id(2);
265 __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
266 __global uchar *src_addr = (__global uchar *)
offset(&src, 0, 0);
268 for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
271 float weight = *((__global
float *)weights_addr);
274 float4 src0 = vload4(0, (__global
float *)(src_addr + 0 * src_stride_y));
275 float4 src1 = vload4(0, (__global
float *)(src_addr + 1 * src_stride_y));
276 float4 src2 = vload4(0, (__global
float *)(src_addr + 2 * src_stride_y));
277 float4 src3 = vload4(0, (__global
float *)(src_addr + 3 * src_stride_y));
279 CONVOLUTION1x1_BIFROST(acc0, src0, weight);
280 CONVOLUTION1x1_BIFROST(acc1, src1, weight);
281 CONVOLUTION1x1_BIFROST(acc2, src2, weight);
282 CONVOLUTION1x1_BIFROST(acc3, src3, weight);
284 src_addr += src_stride_z;
291 float bias = (float) * ((__global
float *)(
vector_offset(&biases, kernel_index)));
311 vstore4(acc0, 0, (__global
float *)(dst.
ptr + 0 * dst_stride_y));
312 vstore4(acc1, 0, (__global
float *)(dst.
ptr + 1 * dst_stride_y));
313 vstore4(acc2, 0, (__global
float *)(dst.
ptr + 2 * dst_stride_y));
314 vstore4(acc3, 0, (__global
float *)(dst.
ptr + 3 * dst_stride_y));
316 #endif // defined(WEIGHTS_DEPTH) Structure to hold Vector information.
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
const size_t weights_stride_z
#define CONVERT_TO_IMAGE_STRUCT(name)
#define CONVERT_SAT(a, b)
Structure to hold 3D tensor information.
SimpleTensor< float > src
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)
#define VECTOR_DECLARATION(name)
Structure to hold Image information.
__global uchar * ptr
Pointer to the starting postion of the buffer.
#define CONVERT_TO_TENSOR3D_STRUCT(name)
__global const uchar * vector_offset(const Vector *vec, int x)
Get the pointer position of a Vector.
__global uchar * ptr
Pointer to the starting postion of the buffer.
#define TENSOR3D_DECLARATION(name)
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)
#define VEC_DATA_TYPE(type, size)