/** Compute a byte offset into a 2D-strided buffer.
 *
 * @param[in] ptr      Base pointer (byte addressed)
 * @param[in] x        Column index
 * @param[in] y        Row index
 * @param[in] stride_x Stride in bytes along X
 * @param[in] stride_y Stride in bytes along Y
 *
 * @return @p ptr advanced by x * stride_x + y * stride_y bytes.
 */
inline __global uchar *ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
{
    return ptr + x * stride_x + y * stride_y;
}
#if(DILATION_X == 1 && DILATION_Y == 1)

/** Accumulate a 1x3 convolution of 2 adjacent output pixels, stride 1.
 * src0 must hold at least 4 consecutive input values; weights_row0 holds the 3 taps.
 * Written as chained fma() calls (Bifrost-friendly FMA mapping).
 */
#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
    ({                                                             \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
    })

/** Accumulate a 1x3 convolution of 4 adjacent output pixels, stride 1.
 * src0 must hold at least 6 consecutive input values.
 */
#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
    ({                                                             \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
        acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2);            \
        acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2);            \
        acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2);            \
        acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3);            \
        acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3);            \
        acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3);            \
    })

/** Accumulate a 1x3 convolution of 2 output pixels, stride 2.
 * src0 holds input elements 0..3; src1.s0 holds element 4.
 */
#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
    ({                                                                   \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
        acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1);                  \
    })

/** Accumulate a 1x3 convolution of 4 output pixels, stride 2.
 * src0 holds input elements 0..7; src1.s0 holds element 8.
 */
#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
    ({                                                                   \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
        acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1);                  \
        acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2);                  \
        acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2);                  \
        acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2);                  \
        acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3);                  \
        acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3);                  \
        acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3);                  \
    })

#else /* DILATION_X==1 && DILATION_Y==1 */

/** Dilated 1x3 convolution, 2 output pixels, stride 1.
 * The left/mid/right vectors are loaded DILATION_X columns apart by the caller.
 */
#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
    ({                                                                                        \
        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
        acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1);                                  \
        acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1);                                   \
        acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1);                                 \
    })

/** Dilated 1x3 convolution, 2 output pixels, stride 2 (consumes even lanes 0 and 2). */
#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
    ({                                                                                        \
        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
        acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1);                                  \
        acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1);                                   \
        acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1);                                 \
    })

/** Dilated 1x3 convolution, 4 output pixels, stride 1. */
#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0) \
    ({                                                                                        \
        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
        acc.s1 = fma(src0_left.s1, weights_row0.s0, acc.s1);                                  \
        acc.s1 = fma(src0_mid.s1, weights_row0.s1, acc.s1);                                   \
        acc.s1 = fma(src0_right.s1, weights_row0.s2, acc.s1);                                 \
        acc.s2 = fma(src0_left.s2, weights_row0.s0, acc.s2);                                  \
        acc.s2 = fma(src0_mid.s2, weights_row0.s1, acc.s2);                                   \
        acc.s2 = fma(src0_right.s2, weights_row0.s2, acc.s2);                                 \
        acc.s3 = fma(src0_left.s3, weights_row0.s0, acc.s3);                                  \
        acc.s3 = fma(src0_mid.s3, weights_row0.s1, acc.s3);                                   \
        acc.s3 = fma(src0_right.s3, weights_row0.s2, acc.s3);                                 \
    })

/** Dilated 1x3 convolution, 4 output pixels, stride 2 (even lanes 0,2,4,6). */
#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0) \
    ({                                                                                        \
        acc.s0 = fma(src0_left.s0, weights_row0.s0, acc.s0);                                  \
        acc.s0 = fma(src0_mid.s0, weights_row0.s1, acc.s0);                                   \
        acc.s0 = fma(src0_right.s0, weights_row0.s2, acc.s0);                                 \
        acc.s1 = fma(src0_left.s2, weights_row0.s0, acc.s1);                                  \
        acc.s1 = fma(src0_mid.s2, weights_row0.s1, acc.s1);                                   \
        acc.s1 = fma(src0_right.s2, weights_row0.s2, acc.s1);                                 \
        acc.s2 = fma(src0_left.s4, weights_row0.s0, acc.s2);                                  \
        acc.s2 = fma(src0_mid.s4, weights_row0.s1, acc.s2);                                   \
        acc.s2 = fma(src0_right.s4, weights_row0.s2, acc.s2);                                 \
        acc.s3 = fma(src0_left.s6, weights_row0.s0, acc.s3);                                  \
        acc.s3 = fma(src0_mid.s6, weights_row0.s1, acc.s3);                                   \
        acc.s3 = fma(src0_right.s6, weights_row0.s2, acc.s3);                                 \
    })

#endif /* DILATION_X==1 && DILATION_Y==1 */

#if defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)
#if defined(CONV_STRIDE_X)

// Bind convolution1x3 to the implementation matching the compile-time stride.
#if CONV_STRIDE_X == 1
#define convolution1x3 convolution1x3_stride_1
#elif CONV_STRIDE_X == 2
#define convolution1x3 convolution1x3_stride_2
#elif CONV_STRIDE_X == 3
#define convolution1x3 convolution1x3_stride_3
#else /* CONV_STRIDE_X */
#error "Stride not supported"
#endif /* CONV_STRIDE_X */

/** Compute two adjacent F32 output pixels of a 1x3 row convolution with stride 1.
 *
 * @param[in] left_pixel   Pointer (byte addressed) to the left-most input of the window
 * @param[in] left_coeff   Weight of the left tap
 * @param[in] middle_coeff Weight of the middle tap
 * @param[in] right_coeff  Weight of the right tap
 *
 * @return The two convolved output values.
 */
inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
                                      const float left_coeff,
                                      const float middle_coeff,
                                      const float right_coeff)
{
#if(DILATION_X == 1 && DILATION_Y == 1)
    // One vector load covers both overlapping 1x3 windows.
    float4 temp = vload4(0, (__global float *)left_pixel);

    float2 left   = CONVERT(temp.s01, float2);
    float2 middle = CONVERT(temp.s12, float2);
    float2 right  = CONVERT(temp.s23, float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
#else  /* DILATION_X==1 && DILATION_Y==1 */
    // Dilated taps: each tap is DILATION_X floats further right.
    return vload2(0, (__global float *)left_pixel) * (float2)left_coeff
           + vload2(0, (__global float *)(left_pixel) + DILATION_X) * (float2)middle_coeff
           + vload2(0, (__global float *)(left_pixel) + 2 * DILATION_X) * (float2)right_coeff;
#endif /* DILATION_X==1 && DILATION_Y==1 */
}
/** Compute two F32 output pixels of a 1x3 row convolution with stride 2.
 *
 * @param[in] left_pixel   Pointer (byte addressed) to the left-most input of the window
 * @param[in] left_coeff   Weight of the left tap
 * @param[in] middle_coeff Weight of the middle tap
 * @param[in] right_coeff  Weight of the right tap
 *
 * @return The two convolved output values.
 */
inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
                                      const float left_coeff,
                                      const float middle_coeff,
                                      const float right_coeff)
{
#if(DILATION_X == 1 && DILATION_Y == 1)
    // The two stride-2 windows span input elements 0..4: load 0..3 vectorised plus element 4.
    float4 temp0 = vload4(0, (__global float *)left_pixel);
    float  temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));

    float2 left   = CONVERT(temp0.s02, float2);
    float2 middle = CONVERT(temp0.s13, float2);
    float2 right  = CONVERT((float2)(temp0.s2, temp1), float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
#else  /* DILATION_X==1 && DILATION_Y==1 */
    __global float *left_pixel_float = (__global float *)left_pixel;

    // .s02 extracts the two stride-2 samples from each dilated 4-element load.
    return vload4(0, left_pixel_float).s02 * (float2)left_coeff
           + vload4(0, left_pixel_float + DILATION_X).s02 * (float2)middle_coeff
           + vload4(0, left_pixel_float + DILATION_X * 2).s02 * (float2)right_coeff;
#endif /* DILATION_X==1 && DILATION_Y==1 */
}
/** Compute two F32 output pixels of a 1x3 row convolution with stride 3.
 *
 * @param[in] left_pixel   Pointer (byte addressed) to the left-most input of the window
 * @param[in] left_coeff   Weight of the left tap
 * @param[in] middle_coeff Weight of the middle tap
 * @param[in] right_coeff  Weight of the right tap
 *
 * @return The two convolved output values.
 */
inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
                                      const float left_coeff,
                                      const float middle_coeff,
                                      const float right_coeff)
{
#if(DILATION_X == 1 && DILATION_Y == 1)
    // The two stride-3 windows span input elements 0..5: load 0..3 plus 4..5.
    float4 temp0 = vload4(0, (__global float *)left_pixel);
    float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));

    float2 left   = CONVERT(temp0.s03, float2);
    float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
    float2 right  = CONVERT((float2)(temp0.s2, temp1.s1), float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
#else  /* DILATION_X==1 && DILATION_Y==1 */
    __global float *left_pixel_float = (__global float *)left_pixel;

    // Second output pixel is 3 (the stride) floats right of the first for each tap.
    return (float2)(*left_pixel_float, *(left_pixel_float + 3)) * (float2)left_coeff
           + (float2)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3)) * (float2)middle_coeff
           + (float2)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3)) * (float2)right_coeff;
#endif /* DILATION_X==1 && DILATION_Y==1 */
}
// NOTE(review): extraction has garbled this region -- original source line numbers are
// embedded in the code text, and several lines (the enclosing function header,
// tensor/Image setup, output stores, braces and some #endif markers) are missing from
// this view. Code is kept byte-identical; only reviewer comments are added.
//
// Fragment: tail of a 3x3 F32 convolution helper (presumably `convolution3x3` -- the
// opening `inline float2 convolution3x3(` line is not visible; TODO confirm against
// the full file). It accumulates three 1x3 row convolutions taken DILATION_Y rows
// apart using the nine weights mat0..mat8.
282 __global
const uchar *
src,
283 unsigned int src_stride_y,
284 const float mat0,
const float mat1,
const float mat2,
285 const float mat3,
const float mat4,
const float mat5,
286 const float mat6,
const float mat7,
const float mat8)
290 pixels =
convolution1x3((
src + 0 * DILATION_Y * src_stride_y), mat0, mat1, mat2);
291 pixels +=
convolution1x3((
src + 1 * DILATION_Y * src_stride_y), mat3, mat4, mat5);
292 pixels +=
convolution1x3((
src + 2 * DILATION_Y * src_stride_y), mat6, mat7, mat8);
// Fragment: F32 3x3 depthwise convolution kernel. The parameter list (tensor
// declarations) and the final output store are not visible here.
331 __kernel
void depthwise_convolution_3x3(
335 #
if defined(HAS_BIAS)
// Two output pixels per work-item accumulated in a float2.
344 float2 pixels = 0.0f;
// Decompose the Z global id into (channel, batch); DST_CHANNELS channels per batch.
347 const int channel = get_global_id(2) % DST_CHANNELS;
348 const int batch = get_global_id(2) / DST_CHANNELS;
351 __global uchar *weights_addr = weights.
ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
// Source address: undoes the per-batch / per-channel over-stepping introduced by
// DEPTH_MULTIPLIER so that several output channels read the same input plane.
353 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
354 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
// Load the three 3-tap weight rows for this channel.
357 float3 weights_values0 = vload3(0, (__global
float *)(weights_addr + 0 *
weights_stride_y));
358 float3 weights_values1 = vload3(0, (__global
float *)(weights_addr + 1 *
weights_stride_y));
359 float3 weights_values2 = vload3(0, (__global
float *)(weights_addr + 2 *
weights_stride_y));
// Trailing arguments of a call (presumably to convolution3x3; the call head is not
// visible in this view).
362 weights_values0.s0, weights_values0.s1, weights_values0.s2,
363 weights_values1.s0, weights_values1.s1, weights_values1.s2,
364 weights_values2.s0, weights_values2.s1, weights_values2.s2);
// Optional bias: one scalar per channel added to both output pixels.
365 #if defined(HAS_BIAS) 368 float bias = *((__global
float *)(
vector_offset(&biases, channel)));
370 pixels += (float2)bias;
// NOTE(review): mangled extraction region kept byte-identical; comments only.
// The CONVOLUTION1x3_BIFROST* macro invocations and the `return pixels0;` of both
// helpers below are missing from this view.
//
// Helper: 3x3 F32 convolution with dilation, stride 1 in both axes, producing 2
// output pixels (pixels0). Loads left/mid/right float2 pairs for each of the three
// kernel rows, DILATION_X columns and DILATION_Y rows apart.
371 #endif //defined(HAS_BIAS) 375 #endif //defined(CONV_STRIDE_X) 377 #if(DILATION_X > 1 || DILATION_Y > 1) 388 inline float2 convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(__global uchar *src_addr,
const int stride_x_bytes,
const int stride_y_bytes,
389 const int y_offset, __global uchar *weights_addr,
const int weights_stride_y)
392 float3 weights_row0 = vload3(0, (__global
float *)(weights_addr + 0 * weights_stride_y));
393 float3 weights_row1 = vload3(0, (__global
float *)(weights_addr + 1 * weights_stride_y));
394 float3 weights_row2 = vload3(0, (__global
float *)(weights_addr + 2 * weights_stride_y));
396 float2 pixels0 = 0.0f;
// Row 0 taps (left/mid/right are DILATION_X columns apart).
398 float2 src00_left = vload2(0, (__global
float *)
ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes));
399 float2 src00_mid = vload2(0, (__global
float *)
ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
400 float2 src00_right = vload2(0, (__global
float *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
// Row 1 taps (DILATION_Y rows below).
402 float2 src10_left = vload2(0, (__global
float *)
ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
403 float2 src10_mid = vload2(0, (__global
float *)
ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
404 float2 src10_right = vload2(0, (__global
float *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
// Row 2 taps.
406 float2 src20_left = vload2(0, (__global
float *)
ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
407 float2 src20_mid = vload2(0, (__global
float *)
ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
408 float2 src20_right = vload2(0, (__global
float *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
// Helper: same as above but stride 2 in both axes -- float3 loads supply the even
// lanes consumed by the STRIDE2 macros (invocations/return not visible here).
426 inline float2 convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(__global uchar *src_addr,
const int stride_x_bytes,
const int stride_y_bytes,
427 const int y_offset, __global uchar *weights_addr,
const int weights_stride_y)
430 float3 weights_row0 = vload3(0, (__global
float *)(weights_addr + 0 * weights_stride_y));
431 float3 weights_row1 = vload3(0, (__global
float *)(weights_addr + 1 * weights_stride_y));
432 float3 weights_row2 = vload3(0, (__global
float *)(weights_addr + 2 * weights_stride_y));
434 float2 pixels0 = 0.0f;
436 float3 src00_left = vload3(0, (__global
float *)
ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes));
437 float3 src00_mid = vload3(0, (__global
float *)
ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
438 float3 src00_right = vload3(0, (__global
float *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
440 float3 src10_left = vload3(0, (__global
float *)
ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
441 float3 src10_mid = vload3(0, (__global
float *)
ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
442 float3 src10_right = vload3(0, (__global
float *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
444 float3 src20_left = vload3(0, (__global
float *)
ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
445 float3 src20_mid = vload3(0, (__global
float *)
ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
446 float3 src20_right = vload3(0, (__global
float *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
// NOTE(review): mangled extraction region kept byte-identical; comments only.
// Kernel parameter lists, the non-dilated compute (CONVOLUTION1x3_BIFROST* calls),
// the output vstore2 lines and closing braces are missing from this view.
//
// Kernel: Bifrost-optimised F32 3x3 depthwise convolution, stride 1x1.
// Produces a 2x4 tile of outputs (pixels0..pixels3) per work-item.
494 __kernel
void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
498 #
if defined(HAS_BIAS)
507 float2 pixels0 = 0.0f;
508 float2 pixels1 = 0.0f;
509 float2 pixels2 = 0.0f;
510 float2 pixels3 = 0.0f;
// (channel, batch) from the Z global id.
513 const int channel = get_global_id(2) % DST_CHANNELS;
514 const int batch = get_global_id(2) / DST_CHANNELS;
516 __global uchar *weights_addr = weights.
ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
// Source address rewinds the DEPTH_MULTIPLIER over-stepping (same scheme as the
// generic 3x3 kernel above).
517 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
518 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
// Non-dilated path: load 3 weight rows and 6 input rows (4 output rows + 2 halo).
520 #if(DILATION_X == 1 && DILATION_Y == 1) 522 float3 weights_row0 = vload3(0, (__global
float *)(weights_addr + 0 *
weights_stride_y));
523 float3 weights_row1 = vload3(0, (__global
float *)(weights_addr + 1 *
weights_stride_y));
524 float3 weights_row2 = vload3(0, (__global
float *)(weights_addr + 2 *
weights_stride_y));
527 float4 src00 = vload4(0, (__global
float *)(src_addr + 0 * src_stride_y));
528 float4 src10 = vload4(0, (__global
float *)(src_addr + 1 * src_stride_y));
529 float4 src20 = vload4(0, (__global
float *)(src_addr + 2 * src_stride_y));
530 float4 src30 = vload4(0, (__global
float *)(src_addr + 3 * src_stride_y));
531 float4 src40 = vload4(0, (__global
float *)(src_addr + 4 * src_stride_y));
532 float4 src50 = vload4(0, (__global
float *)(src_addr + 5 * src_stride_y));
// Dilated path: delegate each output row to the dilation helper (y_offset 0..3).
550 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src_stride_x, src_stride_y, 0, weights_addr,
weights_stride_y);
552 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src_stride_x, src_stride_y, 1, weights_addr,
weights_stride_y);
554 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src_stride_x, src_stride_y, 2, weights_addr,
weights_stride_y);
556 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src_stride_x, src_stride_y, 3, weights_addr,
weights_stride_y);
// Optional per-channel bias added to all four output rows.
563 float bias = *((__global
float *)(
vector_offset(&biases, channel)));
565 pixels0 += (float2)bias;
566 pixels1 += (float2)bias;
567 pixels2 += (float2)bias;
568 pixels3 += (float2)bias;
// Kernel: Bifrost-optimised F32 3x3 depthwise convolution, stride 2x2.
// Produces a 2x2 tile (pixels0, pixels1) per work-item.
614 __kernel
void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
618 #
if defined(HAS_BIAS)
627 float2 pixels0 = 0.0f;
628 float2 pixels1 = 0.0f;
631 const int channel = get_global_id(2) % DST_CHANNELS;
632 const int batch = get_global_id(2) / DST_CHANNELS;
634 __global uchar *weights_addr = weights.
ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
635 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
636 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
// Non-dilated path: each input row is loaded as 4 + 2 floats (elements 0..5).
638 #if(DILATION_X == 1 && DILATION_Y == 1) 641 float3 weights_row0 = vload3(0, (__global
float *)(weights_addr + 0 *
weights_stride_y));
642 float3 weights_row1 = vload3(0, (__global
float *)(weights_addr + 1 *
weights_stride_y));
643 float3 weights_row2 = vload3(0, (__global
float *)(weights_addr + 2 *
weights_stride_y));
646 float4 src00 = vload4(0, (__global
float *)(src_addr + 0 * src_stride_y));
647 float2 src01 = vload2(2, (__global
float *)(src_addr + 0 * src_stride_y));
648 float4 src10 = vload4(0, (__global
float *)(src_addr + 1 * src_stride_y));
649 float2 src11 = vload2(2, (__global
float *)(src_addr + 1 * src_stride_y));
650 float4 src20 = vload4(0, (__global
float *)(src_addr + 2 * src_stride_y));
651 float2 src21 = vload2(2, (__global
float *)(src_addr + 2 * src_stride_y));
652 float4 src30 = vload4(0, (__global
float *)(src_addr + 3 * src_stride_y));
653 float2 src31 = vload2(2, (__global
float *)(src_addr + 3 * src_stride_y));
654 float4 src40 = vload4(0, (__global
float *)(src_addr + 4 * src_stride_y));
655 float2 src41 = vload2(2, (__global
float *)(src_addr + 4 * src_stride_y));
// Dilated path: output rows are 2 apart because of the stride-2 Y.
667 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src_stride_x, src_stride_y, 0, weights_addr,
weights_stride_y);
669 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src_stride_x, src_stride_y, 2, weights_addr,
weights_stride_y);
675 float bias = *((__global
float *)(
vector_offset(&biases, channel)));
677 pixels0 += (float2)bias;
678 pixels1 += (float2)bias;
// Kernel: reshapes depthwise weights for the optimised path. Most of the body
// (vload/vstore of w0 and the TRANSPOSE variants) is not visible here.
685 #endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32) 687 #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH) 710 __kernel
void depthwise_convolution_reshape_weights(
715 const int x = get_global_id(0);
// Destination row for this work-item, DST_WIDTH elements wide.
737 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH *
sizeof(
DATA_TYPE);
739 #if defined(TRANSPOSE) 741 #error "VEC_SIZE not supported" 742 #else // VEC_SIZE != 4 761 #endif // VEC_SIZE != 4 762 #else // !defined(TRANSPOSE) 764 (w0, 0, dst_addr + 0);
// NOTE(review): mangled extraction region kept byte-identical; comments only.
// FP16 section: stride dispatch for convolution1x3_f16 plus the two dilation helpers.
// The CONVOLUTION1x3_BIFROST* invocations and returns are not visible here.
//
// Helper: FP16 3x3 convolution with dilation, stride 1x1, 4 output pixels.
781 #endif // defined(TRANSPOSE) 783 #endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH) 785 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16) 786 #if defined(CONV_STRIDE_X) 787 #if CONV_STRIDE_X == 1 788 #define convolution1x3_f16 convolution1x3_stride_1_f16 789 #elif CONV_STRIDE_X == 2 790 #define convolution1x3_f16 convolution1x3_stride_2_f16 791 #elif CONV_STRIDE_X == 3 792 #define convolution1x3_f16 convolution1x3_stride_3_f16 794 #error "Stride not supported" 797 #if(DILATION_X > 1 || DILATION_Y > 1) 808 inline half4 convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(__global uchar *src_addr,
const int stride_x_bytes,
const int stride_y_bytes,
809 const int y_offset, __global uchar *weights_addr,
const int weights_stride_y)
812 half3 weights_row0 = vload3(0, (__global
half *)(weights_addr + 0 * weights_stride_y));
813 half3 weights_row1 = vload3(0, (__global
half *)(weights_addr + 1 * weights_stride_y));
814 half3 weights_row2 = vload3(0, (__global
half *)(weights_addr + 2 * weights_stride_y));
816 half4 pixels0 = 0.0f;
// Row 0 taps, DILATION_X columns apart.
818 half4 src00_left = vload4(0, (__global
half *)
ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes));
819 half4 src00_mid = vload4(0, (__global
half *)
ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
820 half4 src00_right = vload4(0, (__global
half *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
// Row 1 taps.
822 half4 src10_left = vload4(0, (__global
half *)
ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
823 half4 src10_mid = vload4(0, (__global
half *)
ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
824 half4 src10_right = vload4(0, (__global
half *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
// Row 2 taps.
826 half4 src20_left = vload4(0, (__global
half *)
ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes))ptr_offset_fix;
827 half4 src20_mid = vload4(0, (__global
half *)
ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
828 half4 src20_right = vload4(0, (__global
half *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
// Helper: FP16 3x3 convolution with dilation, stride 2x2, 4 output pixels
// (half8 loads feed the even-lane STRIDE2 macros).
846 inline half4 convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(__global uchar *src_addr,
const int stride_x_bytes,
const int stride_y_bytes,
847 const int y_offset, __global uchar *weights_addr,
const int weights_stride_y)
850 half3 weights_row0 = vload3(0, (__global
half *)(weights_addr + 0 * weights_stride_y));
851 half3 weights_row1 = vload3(0, (__global
half *)(weights_addr + 1 * weights_stride_y));
852 half3 weights_row2 = vload3(0, (__global
half *)(weights_addr + 2 * weights_stride_y));
854 half4 pixels0 = 0.0f;
856 half8 src00_left = vload8(0, (__global
half *)
ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes));
857 half8 src00_mid = vload8(0, (__global
half *)
ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
858 half8 src00_right = vload8(0, (__global
half *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));
860 half8 src10_left = vload8(0, (__global
half *)
ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
861 half8 src10_mid = vload8(0, (__global
half *)
ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
862 half8 src10_right = vload8(0, (__global
half *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));
864 half8 src20_left = vload8(0, (__global
half *)
ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
865 half8 src20_mid = vload8(0, (__global
half *)
ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
866 half8 src20_right = vload8(0, (__global
half *)
ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));
#endif // (DILATION_X > 1 && DILATION_Y > 1)

/** Compute four adjacent F16 output pixels of a 1x3 row convolution with stride 1.
 *
 * @param[in] left_pixel   Pointer (byte addressed) to the left-most input of the window
 * @param[in] left_coeff   Weight of the left tap
 * @param[in] middle_coeff Weight of the middle tap
 * @param[in] right_coeff  Weight of the right tap
 *
 * @return The four convolved output values.
 */
inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
#if(DILATION_X == 1 && DILATION_Y == 1)
    // One half8 load covers the four overlapping 1x3 windows (elements 0..5 used).
    half8 temp = vload8(0, (__global half *)left_pixel);

    half4 left   = CONVERT(temp.s0123, half4);
    half4 middle = CONVERT(temp.s1234, half4);
    half4 right  = CONVERT(temp.s2345, half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
#else  /* DILATION_X==1 && DILATION_Y==1 */
    // Dilated taps: each tap is DILATION_X halves further right.
    return vload4(0, (__global half *)left_pixel) * (half4)left_coeff
           + vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff
           + vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;
#endif /* DILATION_X==1 && DILATION_Y==1 */
}
/** Compute four F16 output pixels of a 1x3 row convolution with stride 2.
 *
 * @param[in] left_pixel   Pointer (byte addressed) to the left-most input of the window
 * @param[in] left_coeff   Weight of the left tap
 * @param[in] middle_coeff Weight of the middle tap
 * @param[in] right_coeff  Weight of the right tap
 *
 * @return The four convolved output values.
 */
inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
#if(DILATION_X == 1 && DILATION_Y == 1)
    // The four stride-2 windows span input elements 0..8: load 0..7 plus element 8.
    half8 temp0 = vload8(0, (__global half *)left_pixel);
    half  temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));

    half4 left   = CONVERT(temp0.s0246, half4);
    half4 middle = CONVERT(temp0.s1357, half4);
    half4 right  = CONVERT((half4)(temp0.s246, temp1), half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
#else  /* DILATION_X==1 && DILATION_Y==1 */
    __global half *left_pixel_float = (__global half *)left_pixel;

    // Stride-2 samples gathered element by element, shifted by DILATION_X per tap.
    return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff
           + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff
           + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;
#endif /* DILATION_X==1 && DILATION_Y==1 */
}
/** Compute four F16 output pixels of a 1x3 row convolution with stride 3.
 *
 * @param[in] left_pixel   Pointer (byte addressed) to the left-most input of the window
 * @param[in] left_coeff   Weight of the left tap
 * @param[in] middle_coeff Weight of the middle tap
 * @param[in] right_coeff  Weight of the right tap
 *
 * @return The four convolved output values.
 */
inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
#if(DILATION_X == 1 && DILATION_Y == 1)
    // The four stride-3 windows span input elements 0..11; one half16 load covers them.
    half16 temp0 = vload16(0, (__global half *)left_pixel);

    half4 left   = CONVERT(temp0.s0369, half4);
    half4 middle = CONVERT(temp0.s147A, half4);
    half4 right  = CONVERT(temp0.s258B, half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
#else  /* DILATION_X==1 && DILATION_Y==1 */
    __global half *left_pixel_float = (__global half *)left_pixel;

    // Output pixels are 3 (the stride) halves apart for each dilated tap.
    return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff
           + (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff
           + (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;
#endif /* DILATION_X==1 && DILATION_Y==1 */
}
// NOTE(review): mangled extraction region kept byte-identical; comments only.
// The mat0..mat8 parameters, the `half4 pixels;` declaration and the return of
// convolution3x3_f16, and the kernel parameter list / output store, are missing
// from this view.
//
// Helper: FP16 3x3 convolution -- sums three 1x3 row convolutions DILATION_Y rows apart.
999 inline half4 convolution3x3_f16(
1000 __global uchar *
src, uint src_stride_y,
1007 pixels = convolution1x3_f16(
src, mat0, mat1, mat2);
1008 pixels += convolution1x3_f16(
src + DILATION_Y * src_stride_y, mat3, mat4, mat5);
1009 pixels += convolution1x3_f16(
src + DILATION_Y * 2 * src_stride_y, mat6, mat7, mat8);
// Kernel: FP16 3x3 depthwise convolution (generic stride via convolution1x3_f16).
1014 #if defined(DEPTH_MULTIPLIER) 1052 __kernel
void depthwise_convolution_3x3_f16(
1056 #
if defined(HAS_BIAS)
// (channel, batch) from the Z global id.
1064 #if defined(HAS_BIAS) 1066 #endif //defined(HAS_BIAS) 1069 const int channel = get_global_id(2) % DST_CHANNELS;
1070 const int batch = get_global_id(2) / DST_CHANNELS;
// Source address rewinds the DEPTH_MULTIPLIER over-stepping (same scheme as F32).
1072 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
1073 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
1074 __global uchar *weights_addr = weights.
ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
// Weight rows addressed via a precomputed `offset` vector -- presumably the three
// row byte-offsets; its definition is not visible here (TODO confirm).
1077 half3 weights_values0 = vload3(0, (__global
half *)(weights_addr + offset.s0));
1078 half3 weights_values1 = vload3(0, (__global
half *)(weights_addr + offset.s1));
1079 half3 weights_values2 = vload3(0, (__global
half *)(weights_addr + offset.s2));
1081 half4 pixels = convolution3x3_f16(src_addr, src_stride_y, weights_values0.s0, weights_values0.s1, weights_values0.s2,
1082 weights_values1.s0, weights_values1.s1, weights_values1.s2,
1083 weights_values2.s0, weights_values2.s1, weights_values2.s2);
// Optional per-channel bias broadcast to all four outputs.
1084 #if defined(HAS_BIAS) 1085 pixels += (half4)(*((__global
half *)(biases.
ptr + channel * biases_stride_x)));
1086 #endif //defined(HAS_BIAS) 1090 #endif // defined(DEPTH_MULTIPLIER) 1091 #endif // defined(CONV_STRIDE_X) 1130 __kernel
// NOTE(review): mangled extraction region kept byte-identical; comments only.
// Kernel: Bifrost-optimised FP16 3x3 depthwise convolution, stride 1x1, producing a
// 4x4 tile (pixels0..pixels3). Parameter list, weight-row loads of the non-dilated
// path, the CONVOLUTION1x3_BIFROST* calls, the `bias` load and output stores are
// missing from this view.
void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
1134 #
if defined(HAS_BIAS)
1144 const int channel = get_global_id(2) % DST_CHANNELS;
1145 const int batch = get_global_id(2) / DST_CHANNELS;
1153 half4 pixels0 = 0.0f;
1154 half4 pixels1 = 0.0f;
1155 half4 pixels2 = 0.0f;
1156 half4 pixels3 = 0.0f;
1159 __global uchar *weights_addr = weights.
ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
// Source address rewinds the DEPTH_MULTIPLIER over-stepping.
1160 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
1161 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
// Non-dilated path: six half8 input rows (4 output rows + 2 halo).
1163 #if(DILATION_X == 1 && DILATION_Y == 1) 1170 half8 src00 = vload8(0, (__global
half *)(src_addr + 0 * src_stride_y));
1171 half8 src10 = vload8(0, (__global
half *)(src_addr + 1 * src_stride_y));
1172 half8 src20 = vload8(0, (__global
half *)(src_addr + 2 * src_stride_y));
1173 half8 src30 = vload8(0, (__global
half *)(src_addr + 3 * src_stride_y));
1174 half8 src40 = vload8(0, (__global
half *)(src_addr + 4 * src_stride_y));
1175 half8 src50 = vload8(0, (__global
half *)(src_addr + 5 * src_stride_y));
// Dilated path: one helper call per output row (y_offset 0..3).
1193 pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src_stride_x, src_stride_y, 0, weights_addr,
weights_stride_y);
1195 pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src_stride_x, src_stride_y, 1, weights_addr,
weights_stride_y);
1197 pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src_stride_x, src_stride_y, 2, weights_addr,
weights_stride_y);
1199 pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src_stride_x, src_stride_y, 3, weights_addr,
weights_stride_y);
// Optional bias (its load is not visible here) added to all four output rows.
1204 pixels0 += (half4)bias;
1205 pixels1 += (half4)bias;
1206 pixels2 += (half4)bias;
1207 pixels3 += (half4)bias;
// NOTE(review): mangled extraction region kept byte-identical; comments only.
// Kernel: Bifrost-optimised FP16 3x3 depthwise convolution, stride 2x2, producing a
// 4x2 tile (pixels0, pixels1). Parameter list, weight-row loads, compute macros,
// the `bias` load and output stores are missing from this view.
1253 __kernel
void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
1257 #
if defined(HAS_BIAS)
1267 const int channel = get_global_id(2) % DST_CHANNELS;
1268 const int batch = get_global_id(2) / DST_CHANNELS;
1276 half4 pixels0 = 0.0f;
1277 half4 pixels1 = 0.0f;
1280 __global uchar *weights_addr = weights.
ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
// Source address rewinds the DEPTH_MULTIPLIER over-stepping.
1281 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *
1282 (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
// Non-dilated path: each input row is loaded as 8 + 2 halves (elements 0..9).
1284 #if(DILATION_X == 1 && DILATION_Y == 1) 1292 half8 src00 = vload8(0, (__global
half *)(src_addr + 0 * src_stride_y));
1293 half2 src01 = vload2(4, (__global
half *)(src_addr + 0 * src_stride_y));
1294 half8 src10 = vload8(0, (__global
half *)(src_addr + 1 * src_stride_y));
1295 half2 src11 = vload2(4, (__global
half *)(src_addr + 1 * src_stride_y));
1296 half8 src20 = vload8(0, (__global
half *)(src_addr + 2 * src_stride_y));
1297 half2 src21 = vload2(4, (__global
half *)(src_addr + 2 * src_stride_y));
1298 half8 src30 = vload8(0, (__global
half *)(src_addr + 3 * src_stride_y));
1299 half2 src31 = vload2(4, (__global
half *)(src_addr + 3 * src_stride_y));
1300 half8 src40 = vload8(0, (__global
half *)(src_addr + 4 * src_stride_y));
1301 half2 src41 = vload2(4, (__global
half *)(src_addr + 4 * src_stride_y));
// Dilated path: output rows are 2 apart because of the stride-2 Y.
1312 pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src_stride_x, src_stride_y, 0, weights_addr,
weights_stride_y);
1314 pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src_stride_x, src_stride_y, 2, weights_addr,
weights_stride_y);
// Optional bias (its load is not visible here).
1318 pixels0 += (half4)bias;
1319 pixels1 += (half4)bias;
// NOTE(review): mangled extraction region kept byte-identical; comments only.
// Kernel: native MxN depthwise convolution, NHWC layout, F32/F16. Parameter list,
// the accumulator setup (res0), the input/weight loads inside the loop, the bias
// add and the output store are missing from this view.
1325 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16) 1327 #if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER) 1378 __kernel
void dwc_MxN_native_fp_nhwc(
1382 #
if defined(HAS_BIAS)
// Byte offset along the channel dimension, clamped so the leftover vector at the
// end of the row re-reads in-bounds data instead of running past it.
1388 int x_offs = max((
int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0) *
sizeof(
DATA_TYPE);
// NHWC ids: x = channel block, y = width, z = height (batch folded into z when
// DST_DEPTH is defined).
1390 int x = get_global_id(0);
1391 int y = get_global_id(1);
1392 #if defined(DST_DEPTH) 1393 int z = get_global_id(2) % (int)DST_DEPTH;
1394 int b = get_global_id(2) / (int)DST_DEPTH;
1395 #else // defined(DST_DEPTH) 1396 int z = get_global_id(2);
1397 #endif // defined(DST_DEPTH) 1399 __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs;
1401 __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;
1403 __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
1405 #if defined(HAS_BIAS) 1406 __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;
1407 #endif // defined(HAS_BIAS) 1409 #if defined(DST_DEPTH) 1410 s_addr += b * src_stride_w;
1411 d_addr += b * dst_stride_w;
1412 #endif // defined(DST_DEPTH) 1414 for(
int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
// Top-left input coordinate of the receptive field (NHWC: y indexes width,
// z indexes height), shifted left/up by the padding.
1420 int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
1421 int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1423 for(
int yk = 0; yk < KERNEL_HEIGHT; ++yk)
// Skip rows that fall outside the input (implicit zero padding).
1425 if(y_coord >= 0 && y_coord < SRC_DIM2)
1427 int x_coord_tmp = x_coord;
1429 for(
int xk = 0; xk < KERNEL_WIDTH; ++xk)
// Skip columns outside the input.
1431 if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
1433 int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (
int)src_stride_z;
// Midgard has no fast fma; the non-Midgard path accumulates with fma().
1442 #if GPU_ARCH == GPU_ARCH_MIDGARD 1444 #else // GPU_ARCH == GPU_ARCH_MIDGARD 1445 res0 =
fma(i, w, res0);
1446 #endif // GPU_ARCH == GPU_ARCH_MIDGARD 1448 x_coord_tmp += DILATION_X;
1451 y_coord += DILATION_Y;
1454 #if defined(HAS_BIAS) 1456 #endif // defined(HAS_BIAS) 1464 #if defined(HAS_BIAS) 1466 #endif // defined(HAS_BIAS) 1469 #endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER) 1471 #if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE) 1473 #if DATA_TYPE != float || DATA_TYPE != half 1474 #error "Unsupported data type" 1475 #endif // DATA_TYPE != float || DATA_TYPE != half 1477 #define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 1479 #define FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond) \ 1481 basename##0 = select(basename##0, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s0)); \ 1482 basename##1 = select(basename##1, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s1)); \ 1483 basename##2 = select(basename##2, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s2)); \ 1486 #define FILL_ZERO_OUT_OF_BOUND_4(data_type, vec_size, basename, cond) \ 1488 FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond); \ 1489 basename##3 = select(basename##3, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s3)); \ 1492 #if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) 1545 __kernel
void depthwise_convolution_3x3_nhwc(
1549 #
if defined(HAS_BIAS)
1556 int y = get_global_id(1);
1557 #if defined(DST_DEPTH) 1558 int z = get_global_id(2) % (int)DST_DEPTH;
1559 int b = get_global_id(2) / (int)DST_DEPTH;
1560 #else // defined(DST_DEPTH) 1561 int z = get_global_id(2);
1562 #endif // defined(DST_DEPTH) 1564 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
1566 #if defined(DST_DEPTH) 1567 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
1569 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
1572 int3 src_coord_y = (int3)(y * CONV_STRIDE_X - CONV_PAD_LEFT) + (int3)(0, DILATION_X, 2 * DILATION_X);
1573 int3 src_coord_z = (int3)(z * CONV_STRIDE_Y - CONV_PAD_TOP) + (int3)(0, DILATION_Y, 2 * DILATION_Y);
1575 int3 src_offset_y =
clamp(src_coord_y, (int3)0, (int3)(SRC_DIM_1 - 1));
1576 int3 src_offset_z =
clamp(src_coord_z, (int3)0, (int3)(SRC_DIM_2 - 1));
1579 src_coord_y = (src_offset_y != src_coord_y);
1580 src_coord_z = (src_offset_z != src_coord_z);
1582 src_offset_y *= (int3)src_stride_y;
1583 src_offset_z *= (int3)src_stride_z;
1605 FILL_ZERO_OUT_OF_BOUND_3(
DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s0);
1607 acc0 =
fma(values0, w0, acc0);
1608 acc0 =
fma(values1, w1, acc0);
1609 acc0 =
fma(values2, w2, acc0);
1612 values0 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
1613 values1 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
1614 values2 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
1616 FILL_ZERO_OUT_OF_BOUND_3(
DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s1);
1618 acc0 =
fma(values0, w3, acc0);
1619 acc0 =
fma(values1, w4, acc0);
1620 acc0 =
fma(values2, w5, acc0);
1623 values0 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
1624 values1 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
1625 values2 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
1627 FILL_ZERO_OUT_OF_BOUND_3(
DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s2);
1629 acc0 =
fma(values0, w6, acc0);
1630 acc0 =
fma(values1, w7, acc0);
1631 acc0 =
fma(values2, w8, acc0);
1633 #if defined(HAS_BIAS) 1634 __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
1636 acc0 += bias_values;
1637 #endif // defined(HAS_BIAS) 1639 #if defined(DST_DEPTH) 1640 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1642 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z;
1648 #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) 1650 #if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) 1704 __kernel
void depthwise_convolution_3x3_nhwc_stride1(
1708 #
if defined(HAS_BIAS)
1715 int y = get_global_id(1);
1716 #if defined(DST_DEPTH) 1717 int z = get_global_id(2) % (int)DST_DEPTH;
1718 int b = get_global_id(2) / (int)DST_DEPTH;
1719 #else // defined(DST_DEPTH) 1720 int z = get_global_id(2);
1721 #endif // defined(DST_DEPTH) 1723 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;
1725 #if defined(DST_DEPTH) 1726 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;
1728 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;
1731 int4 src_coord_y = (int4)(y * NUM_ROWS_PROCESSED - CONV_PAD_LEFT) +
V_OFFS4(
int);
1732 int4 src_coord_z = (int4)(z * NUM_PLANES_PROCESSED - CONV_PAD_TOP) +
V_OFFS4(
int);
1734 int4 src_offset_y =
clamp(src_coord_y, (int4)0, (int4)(SRC_DIM_1 - 1));
1735 int4 src_offset_z =
clamp(src_coord_z, (int4)0, (int4)(SRC_DIM_2 - 1));
1738 src_coord_y = (src_offset_y != src_coord_y);
1739 src_coord_z = (src_offset_z != src_coord_z);
1741 src_offset_y *= (int4)src_stride_y;
1742 src_offset_z *= (int4)src_stride_z;
1768 FILL_ZERO_OUT_OF_BOUND_4(
DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s0);
1770 acc0 =
fma(values0, w0, acc0);
1771 acc0 =
fma(values1, w1, acc0);
1772 acc0 =
fma(values2, w2, acc0);
1773 acc1 =
fma(values1, w0, acc1);
1774 acc1 =
fma(values2, w1, acc1);
1775 acc1 =
fma(values3, w2, acc1);
1778 values0 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));
1779 values1 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));
1780 values2 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));
1781 values3 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s3));
1783 FILL_ZERO_OUT_OF_BOUND_4(
DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s1);
1785 acc0 =
fma(values0, w3, acc0);
1786 acc0 =
fma(values1, w4, acc0);
1787 acc0 =
fma(values2, w5, acc0);
1788 acc1 =
fma(values1, w3, acc1);
1789 acc1 =
fma(values2, w4, acc1);
1790 acc1 =
fma(values3, w5, acc1);
1792 acc2 =
fma(values0, w0, acc2);
1793 acc2 =
fma(values1, w1, acc2);
1794 acc2 =
fma(values2, w2, acc2);
1795 acc3 =
fma(values1, w0, acc3);
1796 acc3 =
fma(values2, w1, acc3);
1797 acc3 =
fma(values3, w2, acc3);
1800 values0 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));
1801 values1 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));
1802 values2 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));
1803 values3 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s3));
1805 FILL_ZERO_OUT_OF_BOUND_4(
DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s2);
1807 acc0 =
fma(values0, w6, acc0);
1808 acc0 =
fma(values1, w7, acc0);
1809 acc0 =
fma(values2, w8, acc0);
1810 acc1 =
fma(values1, w6, acc1);
1811 acc1 =
fma(values2, w7, acc1);
1812 acc1 =
fma(values3, w8, acc1);
1814 acc2 =
fma(values0, w3, acc2);
1815 acc2 =
fma(values1, w4, acc2);
1816 acc2 =
fma(values2, w5, acc2);
1817 acc3 =
fma(values1, w3, acc3);
1818 acc3 =
fma(values2, w4, acc3);
1819 acc3 =
fma(values3, w5, acc3);
1822 values0 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s0));
1823 values1 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s1));
1824 values2 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s2));
1825 values3 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s3));
1827 FILL_ZERO_OUT_OF_BOUND_4(
DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s3);
1829 acc2 =
fma(values0, w6, acc2);
1830 acc2 =
fma(values1, w7, acc2);
1831 acc2 =
fma(values2, w8, acc2);
1832 acc3 =
fma(values1, w6, acc3);
1833 acc3 =
fma(values2, w7, acc3);
1834 acc3 =
fma(values3, w8, acc3);
1836 #if defined(HAS_BIAS) 1837 __global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;
1841 acc0 += bias_values;
1842 acc1 += bias_values;
1843 acc2 += bias_values;
1844 acc3 += bias_values;
1845 #endif // defined(HAS_BIAS) 1847 int2 dst_offset_y = min((int2)(y * NUM_ROWS_PROCESSED) +
V_OFFS2(
int), (int2)(DST_DIM_1 - 1)) * (int2)dst_stride_y;
1848 int dst_coord_z = z * NUM_PLANES_PROCESSED;
1850 #if defined(DST_DEPTH) 1851 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z + b * dst_stride_w;
1852 #else // defined(DST_DEPTH) 1853 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z;
1854 #endif // defined(DST_DEPTH) 1866 #if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0) 1867 if((dst_coord_z + 1) < DST_DIM_2)
1868 #endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0) 1878 #endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) 1879 #endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE) Structure to hold Vector information.
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of an Image.
const size_t weights_stride_y
half_float::half half
16-bit floating point type
#define IMAGE_DECLARATION(name)
Structure to hold 3D tensor information.
#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0)
SimpleTensor< float > src
DataType clamp(const DataType &n, const DataType &lower=std::numeric_limits< RangeType >::lowest(), const DataType &upper=std::numeric_limits< RangeType >::max())
Performs clamping between a lower and an upper value.
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)
#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0)
#define CONVERT_TO_VECTOR_STRUCT(name)
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)
#define VECTOR_DECLARATION(name)
short8 convolution3x3(Image *src, const short mat0, const short mat1, const short mat2, const short mat3, const short mat4, const short mat5, const short mat6, const short mat7, const short mat8, uint scale)
Apply a 3x3 convolution matrix to a single channel U8 input image and return the result.
__global uchar * ptr
Pointer to the starting position of the buffer.
Structure to hold Image information.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
Store a vector that can only be partial in x.
__global uchar * ptr
Pointer to the starting position of the buffer.
__global const uchar * vector_offset(const Vector *vec, int x)
Get the pointer position of a Vector.
#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0_left, src0_mid, src0_right, weights_row0)
T fma(T x, T y, T z)
Computes (x*y) + z as if to infinite precision and rounded only once to fit the result type...
const size_t weights_stride_z
#define TENSOR4D_DECLARATION(name)
__global uchar * ptr
Pointer to the starting position of the buffer.
__global uchar * ptr_offset(__global uchar *ptr, const int x, const int y, const int stride_x, const int stride_y)
Get the pointer position at a certain offset in the x and y directions.
#define TENSOR3D_DECLARATION(name)
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)
#define ACTIVATION(op, DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)
short8 convolution1x3(__global const uchar *left_pixel, const short left_coeff, const short middle_coeff, const short right_coeff)
Compute a 1D horizontal convolution of size 3 for 8 bytes assuming the input is made of 1 channel of ...
#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0_left, src0_mid, src0_right, weights_row0)
#define VEC_DATA_TYPE(type, size)