28 #define ADD_OP(a, b) ((a) + (b)) 29 #define MUL_OP(a, b) ((a) * (b)) 30 #define CONVERT_SAT(a, b) ((a)) 32 #if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) 35 #define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) 37 #define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) 39 #error "STRIDE_X larger than 2 is not supported" 42 #define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ 44 VEC_DATA_TYPE(DATA_TYPE, 3) \ 45 weights_values0 = vload3(0, weights_row_ptr); \ 46 VEC_DATA_TYPE(DATA_TYPE, 8) \ 47 src0 = vload8(0, src_row_ptr); \ 48 VEC_DATA_TYPE(DATA_TYPE, 2) \ 49 src1 = vload2(0, src_row_ptr + 8); \ 51 acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ 52 acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ 53 acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ 56 #define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ 58 VEC_DATA_TYPE(DATA_TYPE, 3) \ 59 weights_values0 = vload3(0, weights_row_ptr); \ 60 VEC_DATA_TYPE(DATA_TYPE, 16) \ 61 src0 = vload16(0, src_row_ptr); \ 62 DATA_TYPE src1 = *(src_row_ptr + 16); \ 64 acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ 65 acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ 66 acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ 106 __kernel
void direct_convolution3x3(
113 unsigned int weights_stride_w)
122 __global uchar *weights_addr = (__global uchar *)
tensor3D_offset(&weights, 0, 0, 0);
123 __global uchar *src_addr = (__global uchar *)
offset(&src, 0, 0);
125 const int kernel_index = get_global_id(2);
126 weights_addr += kernel_index * weights_stride_w;
128 for(
volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
134 src_addr += src_stride_z;
146 #endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) 148 #if defined(WEIGHTS_DEPTH) 150 #define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \ 152 acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \ 153 acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \ 154 acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \ 155 acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \ 156 acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \ 157 acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \ 158 acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \ 159 acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \ 160 acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \ 161 acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \ 162 acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \ 163 acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \ 202 __kernel
void direct_convolution3x3_f32_bifrost(
209 unsigned int weights_stride_w)
212 const int kernel_index = get_global_id(2);
221 __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w);
222 __global uchar *src_addr = (__global uchar *)
offset(&src, 0, 0);
226 for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d)
229 float3 weights_row0 = vload3(0, (__global
float *)(weights_addr + 0 *
weights_stride_y));
230 float3 weights_row1 = vload3(0, (__global
float *)(weights_addr + 1 *
weights_stride_y));
231 float3 weights_row2 = vload3(0, (__global
float *)(weights_addr + 2 *
weights_stride_y));
236 src0 = vload4(0, (__global
float *)(src_addr + 0 * src_stride_y));
237 src1 = vload2(0, (__global
float *)(src_addr + 0 * src_stride_y) + 4);
239 CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row0);
242 src0 = vload4(0, (__global
float *)(src_addr + 1 * src_stride_y));
243 src1 = vload2(0, (__global
float *)(src_addr + 1 * src_stride_y) + 4);
246 CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row1);
247 CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row0);
250 src0 = vload4(0, (__global
float *)(src_addr + 2 * src_stride_y));
251 src1 = vload2(0, (__global
float *)(src_addr + 2 * src_stride_y) + 4);
254 CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row2);
255 CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row1);
256 CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row0);
259 src0 = vload4(0, (__global
float *)(src_addr + 3 * src_stride_y));
260 src1 = vload2(0, (__global
float *)(src_addr + 3 * src_stride_y) + 4);
263 CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row2);
264 CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row1);
267 src0 = vload4(0, (__global
float *)(src_addr + 4 * src_stride_y));
268 src1 = vload2(0, (__global
float *)(src_addr + 4 * src_stride_y) + 4);
271 CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row2);
273 src_addr += src_stride_z;
280 float bias = (float) * ((__global
float *)(
vector_offset(&biases, kernel_index)));
282 values0 += (float4)bias;
283 values1 += (float4)bias;
284 values2 += (float4)bias;
287 vstore4(values0, 0, (__global
float *)(dst.
ptr + 0 * dst_stride_y));
288 vstore4(values1, 0, (__global
float *)(dst.
ptr + 1 * dst_stride_y));
289 vstore4(values2, 0, (__global
float *)(dst.
ptr + 2 * dst_stride_y));
291 #endif // defined(WEIGHTS_DEPTH) Structure to hold Vector information.
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
#define CONVERT_TO_IMAGE_STRUCT(name)
const size_t weights_stride_y
Structure to hold 3D tensor information.
SimpleTensor< float > src
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)
#define VECTOR_DECLARATION(name)
Structure to hold Image information.
#define CONVERT_TO_TENSOR3D_STRUCT(name)
__global const uchar * vector_offset(const Vector *vec, int x)
Get the pointer position of a Vector.
const size_t weights_stride_z
__global uchar * ptr
Pointer to the starting postion of the buffer.
#define CONVERT_SAT(a, b)
#define TENSOR3D_DECLARATION(name)
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)
__global const uchar * tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
Get the pointer position of a Tensor3D.
#define VEC_DATA_TYPE(type, size)