/* Preprocessor region for the quantized 3x3 depthwise-convolution kernels (OpenCL C).
 *
 * Defines, in order:
 *  - ACTIVATION_FUNC(x): quantized activation when ACTIVATION_TYPE/CONST_0 are set,
 *    identity otherwise.
 *  - VEC_INT / VEC_FLOAT / VEC_SHORT / VEC_TYPE(size): vector-type shorthands built on
 *    VEC_DATA_TYPE with compile-time VEC_SIZE / DATA_TYPE.
 *  - ARM_DOT(x, y, val): 8-bit dot-product accumulate; uses arm_dot_acc when the
 *    cl_arm_integer_dot_product_accumulate_int8 extension is enabled, otherwise
 *    val += arm_dot(x, y).
 *  - GET_VALUES(first_value, left, middle, right): loads three horizontally adjacent
 *    8-wide input rows as int8 accumulator lanes. Separate variants exist for
 *    CONV_STRIDE_X == 1 / == 2 / other, each with a no-dilation fast path and a
 *    DILATION_X-aware path.
 *
 * NOTE(review): this chunk was mechanically extracted — original source line numbers
 * are fused into the text, directives are split mid-token across lines, and several
 * original lines (e.g. the #else/#endif around ACTIVATION_FUNC, braces of the macro
 * bodies) appear to be missing. Verify against the upstream file before compiling;
 * the code below is intentionally left byte-identical to the extracted text.
 */
35 #if defined(ACTIVATION_TYPE) && defined(CONST_0) 37 #define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QUANT(ACTIVATION_TYPE, x) 39 #define ACTIVATION_FUNC(x) (x) 42 #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) 43 #define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) 44 #define VEC_SHORT VEC_DATA_TYPE(short, VEC_SIZE) 46 #if defined(DATA_TYPE) && defined(WEIGHTS_TYPE) 48 #define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size) 50 #if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) 52 #if defined(WEIGHTS_PROMOTED_TYPE) 53 #define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size) 55 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 56 #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 57 #define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val); 58 #else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 59 #define ARM_DOT(x, y, val) val += arm_dot((x), (y)); 60 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 61 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 63 #if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) 66 #error "Stride X not supported" 73 #if CONV_STRIDE_X == 1 74 #define GET_VALUES(first_value, left, middle, right) \ 76 int8 temp0 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \ 77 int2 temp1 = CONVERT(vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))), int2); \ 79 left = CONVERT(temp0.s01234567, int8); \ 80 middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \ 81 right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \ 83 #elif 
CONV_STRIDE_X == 2 84 #define GET_VALUES(first_value, left, middle, right) \ 86 int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ 87 int temp1 = CONVERT(*((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int); \ 89 left = CONVERT(temp0.s02468ace, int8); \ 90 middle = CONVERT(temp0.s13579bdf, int8); \ 91 right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \ 94 #define GET_VALUES(first_value, left, middle, right) \ 96 int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ 97 int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \ 99 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ 100 middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \ 101 right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \ 107 #if CONV_STRIDE_X == 1 108 #define GET_VALUES(first_value, left, middle, right) \ 110 left = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \ 111 middle = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int8); \ 112 right = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int8); \ 114 #elif CONV_STRIDE_X == 2 115 #define GET_VALUES(first_value, left, middle, right) \ 117 int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ 118 left = CONVERT(temp0.s02468ace, int8); \ 120 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \ 121 middle = CONVERT(temp0.s02468ace, int8); \ 123 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \ 124 right = CONVERT(temp0.s02468ace, int8); \ 127 #define GET_VALUES(first_value, left, middle, right) \ 129 int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \ 130 int8 temp1 = CONVERT(vload8(0, (__global 
DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \ 131 left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ 133 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \ 134 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))), int8); \ 135 middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ 137 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \ 138 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))), int8); \ 139 right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \ 185 __kernel
void dwc_3x3_native_quantized8_nchw(
191 #
if defined(HAS_BIAS)
197 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z;
204 const int channel = get_global_id(2) % DST_CHANNELS;
205 const int batch = get_global_id(2) / DST_CHANNELS;
207 #if defined(HAS_BIAS) 210 int bias_value = *((__global
int *)(
vector_offset(&biases, channel)));
211 #endif //defined(HAS_BIAS) 214 src_addr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
215 __global uchar *weights_addr = weights.
ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
218 w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 *
weights_stride_y));
220 w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 *
weights_stride_y));
222 w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 *
weights_stride_y));
224 #if defined(PER_CHANNEL_QUANTIZATION) 225 const int output_multiplier = *((__global
int *)
vector_offset(&output_multipliers, channel));
226 const int output_shift = *((__global
int *)
vector_offset(&output_shifts, channel));
227 #endif // defined(PER_CHANNEL_QUANTIZATION) 231 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 237 int8 left, middle, right;
238 GET_VALUES(src_addr + 0 * src_stride_y, left, middle, right);
239 values0 += left * (int8)(w0.s0);
240 values0 += middle * (int8)(w0.s1);
241 values0 += right * (int8)(w0.s2);
243 #if WEIGHTS_OFFSET != 0 244 sum0 += left + middle + right;
248 GET_VALUES(src_addr + DILATION_Y * src_stride_y, left, middle, right);
249 values0 += left * (int8)(w1.s0);
250 values0 += middle * (int8)(w1.s1);
251 values0 += right * (int8)(w1.s2);
253 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 254 values1 += left * (int8)(w0.s0);
255 values1 += middle * (int8)(w0.s1);
256 values1 += right * (int8)(w0.s2);
259 #if WEIGHTS_OFFSET != 0 260 int8 tmp = left + middle + right;
262 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 268 GET_VALUES(src_addr + 2 * DILATION_Y * src_stride_y, left, middle, right);
269 values0 += left * (int8)(w2.s0);
270 values0 += middle * (int8)(w2.s1);
271 values0 += right * (int8)(w2.s2);
272 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 273 values1 += left * (int8)(w1.s0);
274 values1 += middle * (int8)(w1.s1);
275 values1 += right * (int8)(w1.s2);
278 #if WEIGHTS_OFFSET != 0 279 tmp = left + middle + right;
281 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 286 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 288 GET_VALUES(src_addr + 3 * src_stride_y, left, middle, right);
289 values1 += left * (int8)(w2.s0);
290 values1 += middle * (int8)(w2.s1);
291 values1 += right * (int8)(w2.s2);
293 #if WEIGHTS_OFFSET != 0 294 sum1 += left + middle + right;
298 #if defined(HAS_BIAS) 299 values0 += (int8)(bias_value);
300 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 301 values1 += (int8)(bias_value);
303 #endif //defined(HAS_BIAS) 305 #if WEIGHTS_OFFSET != 0 306 values0 += sum0 * (int8)(WEIGHTS_OFFSET);
307 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 308 values1 += sum1 * (int8)(WEIGHTS_OFFSET);
312 #if INPUT_OFFSET != 0 313 VEC_WEIGHTS_PROMOTED_TYPE(3)
314 tmp_we =
CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) +
CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) +
CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));
316 WEIGHTS_PROMOTED_TYPE sum_weights = tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
317 values0 += sum_weights * (int8)(INPUT_OFFSET);
318 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 319 values1 += sum_weights * (int8)(INPUT_OFFSET);
324 values0 += (int8)(K_OFFSET);
325 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 326 values1 += (int8)(K_OFFSET);
330 #if defined(REAL_MULTIPLIER) 334 #else // defined(REAL_MULTIPLIER) 336 #if defined(PER_CHANNEL_QUANTIZATION) 339 values0 =
select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
340 #else // defined(PER_CHANNEL_QUANTIZATION) 343 #else // OUTPUT_SHIFT < 0 345 #endif // OUTPUT_OFFSET < 0 346 #endif // defined(PER_CHANNEL_QUANTIZATION) 348 #endif // defined(REAL_MULTIPLIER) 350 values0 += (int8)OUTPUT_OFFSET;
355 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 356 #if defined(REAL_MULTIPLIER) 360 #else // defined(REAL_MULTIPLIER) 362 #if defined(PER_CHANNEL_QUANTIZATION) 365 values1 =
select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
366 #else // defined(PER_CHANNEL_QUANTIZATION) 369 #else // OUTPUT_SHIFT < 0 371 #endif // OUTPUT_OFFSET < 0 372 #endif // defined(PER_CHANNEL_QUANTIZATION) 374 #endif // defined(REAL_MULTIPLIER) 376 values1 += (int8)OUTPUT_OFFSET;
/* GET_VALUES variants for the dot8 (IS_DOT8) path: identical loading pattern to the
 * int-accumulator variants above, but the loaded lanes are kept in VEC_TYPE(8)
 * (raw DATA_TYPE vectors) so they can feed ARM_DOT directly.
 *
 * Fix: in the dilated, CONV_STRIDE_X-other variant, the first temp1 load
 * (original line 449) had one extra closing parenthesis —
 *   `...sizeof(DATA_TYPE)))); \` -> `...sizeof(DATA_TYPE))); \`
 * matching the sibling loads at original lines 416, 453 and 457. The stray paren
 * unbalances the macro body and breaks compilation of that branch. Everything
 * else in this region is left byte-identical to the extracted text.
 *
 * NOTE(review): extraction dropped several original lines in this span (macro-body
 * braces, #else/#endif around the CONV_STRIDE_X branches) — verify against upstream.
 */
384 #else // !defined(IS_DOT8) 387 #if CONV_STRIDE_X == 1 388 #define GET_VALUES(first_value, left, middle, right) \ 391 temp0 = vload8(0, (__global DATA_TYPE *)(first_value)); \ 393 temp1 = vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))); \ 395 left = temp0.s01234567; \ 396 middle = (VEC_TYPE(8))(temp0.s1234, temp0.s567, temp1.s0); \ 397 right = (VEC_TYPE(8))(temp0.s2345, temp0.s67, temp1.s01); \ 399 #elif CONV_STRIDE_X == 2 400 #define GET_VALUES(first_value, left, middle, right) \ 403 temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ 404 DATA_TYPE temp1 = *((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \ 406 left = temp0.s02468ace; \ 407 middle = temp0.s13579bdf; \ 408 right = (VEC_TYPE(8))(temp0.s2468, temp0.sace, temp1); \ 411 #define GET_VALUES(first_value, left, middle, right) \ 414 temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ 416 temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \ 418 left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ 419 middle = (VEC_TYPE(8))(temp0.s147a, temp0.sd, temp1.s036); \ 420 right = (VEC_TYPE(8))(temp0.s258b, temp0.se, temp1.s147); \ 425 #if CONV_STRIDE_X == 1 426 #define GET_VALUES(first_value, left, middle, right) \ 428 left = vload8(0, (__global DATA_TYPE *)(first_value)); \ 429 middle = vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \ 430 right = vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \ 432 #elif CONV_STRIDE_X == 2 433 #define GET_VALUES(first_value, left, middle, right) \ 436 temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ 437 left = temp0.s02468ace; \ 438 temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \ 439 middle = temp0.s02468ace; \ 440 temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \ 441 right = temp0.s02468ace; \ 444 #define 
GET_VALUES(first_value, left, middle, right) \ 447 temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \ 449 temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \ 450 left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ 452 temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \ 453 temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))); \ 454 middle = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ 456 temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \ 457 temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))); \ 458 right = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \ 503 __kernel
void dwc_3x3_native_quantized8_dot8_nchw(
509 #
if defined(HAS_BIAS)
515 __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z;
522 const int channel = get_global_id(2) % DST_CHANNELS;
523 const int batch = get_global_id(2) / DST_CHANNELS;
525 #if defined(HAS_BIAS) 528 const int bias_value = *((__global
int *)(
vector_offset(&biases, channel)));
529 #endif //defined(HAS_BIAS) 532 src_addr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
533 __global uchar *weights_addr = weights.
ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
536 w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 *
weights_stride_y));
538 w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 *
weights_stride_y));
540 w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 *
weights_stride_y));
542 const int output_multiplier = *((__global
int *)
vector_offset(&output_multipliers, 0));
543 const int output_shift = *((__global
int *)
vector_offset(&output_shifts, 0));
546 left0, middle0, right0;
548 left1, middle1, right1;
550 left2, middle2, right2;
555 GET_VALUES(src_addr + 0 * src_stride_y, left0, middle0, right0);
556 GET_VALUES(src_addr + DILATION_Y * src_stride_y, left1, middle1, right1);
557 GET_VALUES(src_addr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);
559 #if WEIGHTS_OFFSET != 0 560 sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
561 sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
562 sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
565 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 569 left3, middle3, right3;
573 GET_VALUES(src_addr + 3 * src_stride_y, left3, middle3, right3);
575 #if WEIGHTS_OFFSET != 0 576 sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
577 sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
578 sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);
580 #endif // CONV_STRIDE_Y == 1 && DILATION_Y==1 582 ARM_DOT((VEC_TYPE(4))(left0.s0, middle0.s0, right0.s0, left1.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
583 ARM_DOT((VEC_TYPE(4))(middle1.s0, right1.s0, left2.s0, middle2.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
584 values0.s0 += right2.s0 * w2.s2;
586 ARM_DOT((VEC_TYPE(4))(left0.s1, middle0.s1, right0.s1, left1.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);
587 ARM_DOT((VEC_TYPE(4))(middle1.s1, right1.s1, left2.s1, middle2.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);
588 values0.s1 += right2.s1 * w2.s2;
590 ARM_DOT((VEC_TYPE(4))(left0.s2, middle0.s2, right0.s2, left1.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);
591 ARM_DOT((VEC_TYPE(4))(middle1.s2, right1.s2, left2.s2, middle2.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);
592 values0.s2 += right2.s2 * w2.s2;
594 ARM_DOT((VEC_TYPE(4))(left0.s3, middle0.s3, right0.s3, left1.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);
595 ARM_DOT((VEC_TYPE(4))(middle1.s3, right1.s3, left2.s3, middle2.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);
596 values0.s3 += right2.s3 * w2.s2;
598 ARM_DOT((VEC_TYPE(4))(left0.s4, middle0.s4, right0.s4, left1.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);
599 ARM_DOT((VEC_TYPE(4))(middle1.s4, right1.s4, left2.s4, middle2.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);
600 values0.s4 += right2.s4 * w2.s2;
602 ARM_DOT((VEC_TYPE(4))(left0.s5, middle0.s5, right0.s5, left1.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);
603 ARM_DOT((VEC_TYPE(4))(middle1.s5, right1.s5, left2.s5, middle2.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);
604 values0.s5 += right2.s5 * w2.s2;
606 ARM_DOT((VEC_TYPE(4))(left0.s6, middle0.s6, right0.s6, left1.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);
607 ARM_DOT((VEC_TYPE(4))(middle1.s6, right1.s6, left2.s6, middle2.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);
608 values0.s6 += right2.s6 * w2.s2;
610 ARM_DOT((VEC_TYPE(4))(left0.s7, middle0.s7, right0.s7, left1.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);
611 ARM_DOT((VEC_TYPE(4))(middle1.s7, right1.s7, left2.s7, middle2.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
612 values0.s7 += right2.s7 * w2.s2;
614 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 615 ARM_DOT((VEC_TYPE(4))(left1.s0, middle1.s0, right1.s0, left2.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
616 ARM_DOT((VEC_TYPE(4))(middle2.s0, right2.s0, left3.s0, middle3.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
617 values1.s0 += right3.s0 * w2.s2;
619 ARM_DOT((VEC_TYPE(4))(left1.s1, middle1.s1, right1.s1, left2.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);
620 ARM_DOT((VEC_TYPE(4))(middle2.s1, right2.s1, left3.s1, middle3.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);
621 values1.s1 += right3.s1 * w2.s2;
623 ARM_DOT((VEC_TYPE(4))(left1.s2, middle1.s2, right1.s2, left2.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);
624 ARM_DOT((VEC_TYPE(4))(middle2.s2, right2.s2, left3.s2, middle3.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);
625 values1.s2 += right3.s2 * w2.s2;
627 ARM_DOT((VEC_TYPE(4))(left1.s3, middle1.s3, right1.s3, left2.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);
628 ARM_DOT((VEC_TYPE(4))(middle2.s3, right2.s3, left3.s3, middle3.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);
629 values1.s3 += right3.s3 * w2.s2;
631 ARM_DOT((VEC_TYPE(4))(left1.s4, middle1.s4, right1.s4, left2.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);
632 ARM_DOT((VEC_TYPE(4))(middle2.s4, right2.s4, left3.s4, middle3.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);
633 values1.s4 += right3.s4 * w2.s2;
635 ARM_DOT((VEC_TYPE(4))(left1.s5, middle1.s5, right1.s5, left2.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);
636 ARM_DOT((VEC_TYPE(4))(middle2.s5, right2.s5, left3.s5, middle3.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);
637 values1.s5 += right3.s5 * w2.s2;
639 ARM_DOT((VEC_TYPE(4))(left1.s6, middle1.s6, right1.s6, left2.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);
640 ARM_DOT((VEC_TYPE(4))(middle2.s6, right2.s6, left3.s6, middle3.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);
641 values1.s6 += right3.s6 * w2.s2;
643 ARM_DOT((VEC_TYPE(4))(left1.s7, middle1.s7, right1.s7, left2.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
644 ARM_DOT((VEC_TYPE(4))(middle2.s7, right2.s7, left3.s7, middle3.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
645 values1.s7 += right3.s7 * w2.s2;
646 #endif // CONV_STRIDE_Y == 1 && DILATION_Y==1 648 #if defined(HAS_BIAS) 649 values0 += (int8)(bias_value);
650 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 651 values1 += (int8)(bias_value);
653 #endif //defined(HAS_BIAS) 655 #if WEIGHTS_OFFSET != 0 656 values0 += sum0 * (int8)(WEIGHTS_OFFSET);
657 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 658 values1 += sum1 * (int8)(WEIGHTS_OFFSET);
662 #if INPUT_OFFSET != 0 663 WEIGHTS_PROMOTED_TYPE sum_weights = 0;
664 VEC_WEIGHTS_PROMOTED_TYPE(3)
665 tmp_we =
CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) +
CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) +
CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));
666 sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
667 values0 += sum_weights * (int8)(INPUT_OFFSET);
668 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 669 values1 += sum_weights * (int8)(INPUT_OFFSET);
674 values0 += (int8)(K_OFFSET);
675 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 676 values1 += (int8)(K_OFFSET);
680 #if defined(REAL_MULTIPLIER) 684 #else // defined(REAL_MULTIPLIER) 686 #if defined(PER_CHANNEL_QUANTIZATION) 689 values0 =
select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
690 #else // defined(PER_CHANNEL_QUANTIZATION) 693 #else // OUTPUT_SHIFT < 0 695 #endif // OUTPUT_OFFSET < 0 696 #endif // defined(PER_CHANNEL_QUANTIZATION) 698 #endif // defined(REAL_MULTIPLIER) 700 values0 += (int8)OUTPUT_OFFSET;
705 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1 707 #if defined(REAL_MULTIPLIER) 711 #else // defined(REAL_MULTIPLIER) 713 #if defined(PER_CHANNEL_QUANTIZATION) 716 values1 =
select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
717 #else // defined(PER_CHANNEL_QUANTIZATION) 720 #else // OUTPUT_SHIFT < 0 722 #endif // OUTPUT_OFFSET < 0 723 #endif // defined(PER_CHANNEL_QUANTIZATION) 725 #endif // defined(REAL_MULTIPLIER) 727 values1 += (int8)OUTPUT_OFFSET;
735 #endif // !defined(IS_DOT8) 739 #if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) 741 #define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE) 743 #define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)) * CONVERT(y, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)), VEC_INT) 745 #if WEIGHTS_OFFSET != 0 746 #define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \ 748 sum += CONVERT(x, VEC_INT); \ 749 MULTIPLY_ADD(x, y, acc); \ 752 #define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc) 755 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 756 #define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \ 758 ARM_DOT((VEC_TYPE(4))(val0, val1, val2, val3), w0.s0123, acc); \ 759 ARM_DOT((VEC_TYPE(4))(val4, val5, val6, val7), w0.s4567, acc); \ 763 #define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \ 766 ARM_DOT((VEC_TYPE(4))(val1, val2, val3, val4), (VEC_TYPE(4))1, sum); \ 767 ARM_DOT((VEC_TYPE(4))(val5, val6, val7, val8), (VEC_TYPE(4))1, sum); \ 770 #define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \ 773 ARM_DOT(w0.s0123, (VEC_TYPE(4))1, sum); \ 774 ARM_DOT(w0.s4567, (VEC_TYPE(4))1, sum); \ 777 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 779 #if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4 831 __kernel
void dwc_3x3_reshaped_quantized8_nhwc(
837 #
if defined(HAS_BIAS)
842 const int x = get_global_id(0);
843 const int y = get_global_id(1);
844 #if defined(DST_DEPTH) 845 int z = get_global_id(2) % (int)DST_DEPTH;
846 int b = get_global_id(2) / (int)DST_DEPTH;
847 #else // defined(DST_DEPTH) 848 int z = get_global_id(2);
849 #endif // defined(DST_DEPTH) 851 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x *
weights_stride_y;
853 #if defined(DST_DEPTH) 854 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
VEC_SIZE + b * src_stride_w;
856 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
VEC_SIZE;
861 int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3)) - (
int)CONV_PAD_LEFT;
864 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
865 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
866 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
867 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
869 int4 y_offset = convert_int4(y_coord * (
int)src_stride_y);
876 w0_tmp =
VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));
878 w1_tmp =
VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));
880 w8 =
VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16));
900 #if INPUT_OFFSET != 0 911 z_coord = z * (int)CONV_STRIDE_Y - (
int)CONV_PAD_TOP;
912 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
913 offset =
select(y_offset + (int4)(z_coord * src_stride_z), (int4)max_offset, (int4)z_coord < 0 || (int4)z_coord >= SRC_DIM_2);
915 values0 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s0));
917 values1 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s1));
919 values2 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s2));
924 z_coord = z * (int)CONV_STRIDE_Y - (
int)CONV_PAD_TOP + DILATION_Y;
925 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
926 offset =
select(y_offset + (int4)(z_coord * src_stride_z), (int4)max_offset, (int4)z_coord < 0 || (int4)z_coord >= SRC_DIM_2);
928 values3 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s0));
930 values4 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s1));
932 values5 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s2));
936 z_coord = z * (int)CONV_STRIDE_Y - (
int)CONV_PAD_TOP + DILATION_Y * 2;
937 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
938 offset =
select(y_offset + (int4)(z_coord * src_stride_z), (int4)max_offset, (int4)z_coord < 0 || (int4)z_coord >= SRC_DIM_2);
940 values6 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s0));
942 values7 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s1));
944 values8 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s2));
946 MULTIPLY_ADD_ACCUMULATE(values0, w0, acc,
sum);
947 MULTIPLY_ADD_ACCUMULATE(values1, w1, acc,
sum);
948 MULTIPLY_ADD_ACCUMULATE(values2, w2, acc,
sum);
950 MULTIPLY_ADD_ACCUMULATE(values3, w3, acc,
sum);
951 MULTIPLY_ADD_ACCUMULATE(values4, w4, acc,
sum);
952 MULTIPLY_ADD_ACCUMULATE(values5, w5, acc,
sum);
954 MULTIPLY_ADD_ACCUMULATE(values6, w6, acc,
sum);
955 MULTIPLY_ADD_ACCUMULATE(values7, w7, acc,
sum);
956 MULTIPLY_ADD_ACCUMULATE(values8, w8, acc,
sum);
958 #if defined(HAS_BIAS) 962 #endif // defined(HAS_BIAS) 964 #if WEIGHTS_OFFSET != 0 965 acc += WEIGHTS_OFFSET *
sum;
968 #if INPUT_OFFSET != 0 969 acc += INPUT_OFFSET * sum_we;
976 #if defined(REAL_MULTIPLIER) 980 #else // defined(REAL_MULTIPLIER) 982 #if defined(PER_CHANNEL_QUANTIZATION) 985 VEC_INT output_multiplier =
VLOAD(VEC_SIZE)(0, (__global
int *)output_multipliers.
ptr);
986 VEC_INT output_shift =
VLOAD(VEC_SIZE)(0, (__global
int *)output_shifts.
ptr);
989 VEC_INT res_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift);
990 acc =
select(res_shift_lt0, res_shift_gt0, output_shift >= 0);
991 #else // defined(PER_CHANNEL_QUANTIZATION) 994 #else // OUTPUT_SHIFT < 0 995 acc = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
996 #endif // OUTPUT_SHIFT < 0 997 #endif // defined(PER_CHANNEL_QUANTIZATION) 999 #endif // defined(REAL_MULTIPLIER) 1001 acc += (
VEC_INT)OUTPUT_OFFSET;
1006 #if defined(DST_DEPTH) 1007 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1009 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
1015 #endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) 1017 #if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4 1070 __kernel
void dwc_3x3_reshaped_quantized8_stride1_nhwc(
1076 #
if defined(HAS_BIAS)
1081 int x = get_global_id(0);
1082 int y = get_global_id(1);
1083 #if defined(DST_DEPTH) 1084 int z = get_global_id(2) % (int)DST_DEPTH;
1085 int b = get_global_id(2) / (int)DST_DEPTH;
1086 #else // defined(DST_DEPTH) 1087 int z = get_global_id(2);
1088 #endif // defined(DST_DEPTH) 1090 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x *
weights_stride_y;
1092 #if defined(DST_DEPTH) 1093 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
VEC_SIZE + b * src_stride_w;
1095 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
VEC_SIZE;
1100 int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (
int)CONV_PAD_LEFT;
1103 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
1104 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
1105 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
1106 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
1108 int4 y_offset = convert_int4(y_coord * (
int)src_stride_y);
1118 w0_tmp =
VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));
1120 w1_tmp =
VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));
1122 w8 =
VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16));
1142 #if INPUT_OFFSET != 0 1153 z_coord = z * (int)NUM_PLANES_PROCESSED - (
int)CONV_PAD_TOP;
1154 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1155 offset =
select(y_offset + (int4)(z_coord * src_stride_z), (int4)max_offset, (int4)z_coord < 0 || (int4)z_coord >= SRC_DIM_2);
1157 values0 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s0));
1159 values1 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s1));
1161 values2 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s2));
1163 values3 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s3));
1166 z_coord = z * (int)NUM_PLANES_PROCESSED - (
int)CONV_PAD_TOP + 1;
1167 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1168 offset =
select(y_offset + (int4)(z_coord * src_stride_z), (int4)max_offset, (int4)z_coord < 0 || (int4)z_coord >= SRC_DIM_2);
1170 values4 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s0));
1172 values5 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s1));
1174 values6 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s2));
1176 values7 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s3));
1179 z_coord = z * (int)NUM_PLANES_PROCESSED - (
int)CONV_PAD_TOP + 2;
1180 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1181 offset =
select(y_offset + (int4)(z_coord * src_stride_z), (int4)max_offset, (int4)z_coord < 0 || (int4)z_coord >= SRC_DIM_2);
1183 values8 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s0));
1185 values9 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s1));
1187 values10 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s2));
1189 values11 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s3));
1192 z_coord = z * (int)NUM_PLANES_PROCESSED - (
int)CONV_PAD_TOP + 3;
1193 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1194 offset =
select(y_offset + (int4)(z_coord * src_stride_z), (int4)max_offset, (int4)z_coord < 0 || (int4)z_coord >= SRC_DIM_2);
1196 values12 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s0));
1198 values13 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s1));
1200 values14 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s2));
1202 values15 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s3));
1204 MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);
1205 MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);
1206 MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);
1207 MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);
1208 MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);
1209 MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);
1211 MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);
1212 MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);
1213 MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);
1214 MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);
1215 MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);
1216 MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);
1218 MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);
1219 MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);
1220 MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);
1221 MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);
1222 MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);
1223 MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);
1225 MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);
1226 MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);
1227 MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);
1228 MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);
1229 MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);
1230 MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);
1232 MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);
1233 MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);
1234 MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);
1235 MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);
1236 MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);
1237 MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);
1239 MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);
1240 MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);
1241 MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);
1242 MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);
1243 MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);
1244 MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);
1246 #if defined(HAS_BIAS) 1251 acc0 += bias_values;
1252 acc1 += bias_values;
1253 acc2 += bias_values;
1254 acc3 += bias_values;
1257 #if WEIGHTS_OFFSET != 0 1258 acc0 += WEIGHTS_OFFSET * sum0;
1259 acc1 += WEIGHTS_OFFSET * sum1;
1260 acc2 += WEIGHTS_OFFSET * sum2;
1261 acc3 += WEIGHTS_OFFSET * sum3;
1264 #if INPUT_OFFSET != 0 1265 VEC_INT offs = INPUT_OFFSET * sum_we;
1280 #if defined(REAL_MULTIPLIER) 1287 #else // defined(REAL_MULTIPLIER) 1289 #if defined(PER_CHANNEL_QUANTIZATION) 1292 VEC_INT output_multiplier =
VLOAD(VEC_SIZE)(0, (__global
int *)output_multipliers.
ptr);
1293 VEC_INT output_shift =
VLOAD(VEC_SIZE)(0, (__global
int *)output_shifts.
ptr);
1299 VEC_INT res0_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
1300 VEC_INT res1_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
1301 VEC_INT res2_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
1302 VEC_INT res3_shift_gt0 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
1303 acc0 =
select(res0_shift_lt0, res0_shift_gt0, output_shift >= 0);
1304 acc1 =
select(res1_shift_lt0, res1_shift_gt0, output_shift >= 0);
1305 acc2 =
select(res2_shift_lt0, res2_shift_gt0, output_shift >= 0);
1306 acc3 =
select(res3_shift_lt0, res3_shift_gt0, output_shift >= 0);
1307 #else // defined(PER_CHANNEL_QUANTIZATION) 1308 #if OUTPUT_SHIFT < 0 1313 #else // OUTPUT_SHIFT < 0 1314 acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1315 acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1316 acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1317 acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1318 #endif // OUTPUT_SHIFT < 0 1319 #endif // defined(PER_CHANNEL_QUANTIZATION) 1321 #endif // defined(REAL_MULTIPLIER) 1323 acc0 += (
VEC_INT)OUTPUT_OFFSET;
1324 acc1 += (
VEC_INT)OUTPUT_OFFSET;
1325 acc2 += (
VEC_INT)OUTPUT_OFFSET;
1326 acc3 += (
VEC_INT)OUTPUT_OFFSET;
1337 #if defined(DST_DEPTH) 1338 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;
1340 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;
1348 #if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0) 1349 if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)
1359 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE == 4 1414 __kernel
void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(
1420 #
if defined(HAS_BIAS)
1425 int x = get_global_id(0);
1426 int y = get_global_id(1);
1427 #if defined(DST_DEPTH) 1428 int z = get_global_id(2) % (int)DST_DEPTH;
1429 int b = get_global_id(2) / (int)DST_DEPTH;
1430 #else // defined(DST_DEPTH) 1431 int z = get_global_id(2);
1432 #endif // defined(DST_DEPTH) 1434 __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x *
weights_stride_y;
1436 #if defined(DST_DEPTH) 1437 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
VEC_SIZE + b * src_stride_w;
1439 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
VEC_SIZE;
1444 int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (
int)CONV_PAD_LEFT;
1447 y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);
1448 y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);
1449 y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);
1450 y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);
1452 int4 y_offset = convert_int4(y_coord * (
int)src_stride_y);
1462 w0 =
VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));
1464 w1 =
VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));
1466 w2 =
VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 32));
1468 #if INPUT_OFFSET != 0 1470 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s0, w0.s01234567, w0.s8);
1471 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
1472 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s2, w1.s23456789, w1.sA);
1473 DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
1476 acc0 = INPUT_OFFSET * acc0;
1479 #endif // INPUT_OFFSET != 0 1486 z_coord = z - (int)CONV_PAD_TOP;
1487 z_coord = min((uint)z_coord, (uint)SRC_DIM_2);
1488 offset = y_offset + (int4)(z_coord * src_stride_z);
1489 offset = min(offset, (int4)max_offset);
1492 values0 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s0));
1494 values1 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s1));
1496 values2 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s2));
1498 values3 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s3));
1503 z_coord = z - (int)CONV_PAD_TOP + 1;
1504 offset = y_offset + (int4)(z_coord * src_stride_z);
1506 values4 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s0));
1508 values5 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s1));
1510 values6 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s2));
1512 values7 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s3));
1517 offset += (int4)src_stride_z;
1518 offset = min(offset, (int4)max_offset);
1520 values8 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s0));
1522 values9 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s1));
1524 values10 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s2));
1526 values11 =
VLOAD(VEC_SIZE)(0, (__global
DATA_TYPE *)(src_addr + offset.s3));
1528 DOT_PRODUCT_REDUCTION(sum0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0);
1529 DOT_PRODUCT_REDUCTION(sum1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0);
1530 DOT_PRODUCT(acc0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0, w0.s01234567, w0.s8);
1531 DOT_PRODUCT(acc1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0, w0.s01234567, w0.s8);
1533 DOT_PRODUCT_REDUCTION(sum0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1);
1534 DOT_PRODUCT_REDUCTION(sum1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1);
1535 DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
1536 DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);
1538 DOT_PRODUCT_REDUCTION(sum0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2);
1539 DOT_PRODUCT_REDUCTION(sum1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2);
1540 DOT_PRODUCT(acc0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2, w1.s23456789, w1.sA);
1541 DOT_PRODUCT(acc1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2, w1.s23456789, w1.sA);
1543 DOT_PRODUCT_REDUCTION(sum0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3);
1544 DOT_PRODUCT_REDUCTION(sum1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3);
1545 DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
1546 DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);
1548 #if defined(HAS_BIAS) 1553 acc0 += bias_values;
1554 acc1 += bias_values;
1556 #endif // defined(HAS_BIAS) 1558 #if WEIGHTS_OFFSET != 0 1559 acc0 += WEIGHTS_OFFSET * sum0;
1560 acc1 += WEIGHTS_OFFSET * sum1;
1561 #endif // WEIGHTS_OFFSET != 0 1567 #endif // K_OFFSET != 0 1569 #if defined(REAL_MULTIPLIER) 1574 #else // defined(REAL_MULTIPLIER) 1576 #if OUTPUT_SHIFT < 0 1579 #else // OUTPUT_SHIFT < 0 1580 acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1581 acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
1582 #endif // OUTPUT_SHIFT < 0 1584 #endif // defined(REAL_MULTIPLIER) 1585 acc0 += (
VEC_INT)OUTPUT_OFFSET;
1586 acc1 += (
VEC_INT)OUTPUT_OFFSET;
1593 #if defined(DST_DEPTH) 1594 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;
1596 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;
1604 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE==4 1606 #endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) 1608 #endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) 1610 #endif // defined(WEIGHTS_PROMOTED_TYPE) 1612 #endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER)) 1614 #if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER) 1672 __kernel
void dwc_MxN_native_quantized8_nhwc(
1678 #
if defined(HAS_BIAS)
1684 int x_offs = max((
int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0);
1685 int y = get_global_id(1);
1686 #if defined(DST_DEPTH) 1687 int z = get_global_id(2) % (int)DST_DEPTH;
1688 int b = get_global_id(2) / (int)DST_DEPTH;
1689 #else // defined(DST_DEPTH) 1690 int z = get_global_id(2);
1691 #endif // defined(DST_DEPTH) 1693 __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs *
sizeof(
DATA_TYPE);
1695 __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs *
sizeof(
DATA_TYPE) * (
int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;
1697 __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs *
sizeof(WEIGHTS_TYPE) * (
int)DEPTH_MULTIPLIER;
1699 #if defined(HAS_BIAS) 1700 __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs *
sizeof(int) * (
int)DEPTH_MULTIPLIER;
1701 #endif // defined(HAS_BIAS) 1703 #if defined(PER_CHANNEL_QUANTIZATION) 1704 __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x_offs *
sizeof(int) * (
int)DEPTH_MULTIPLIER;
1705 __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x_offs *
sizeof(int) * (
int)DEPTH_MULTIPLIER;
1706 #endif // defined(PER_CHANNEL_QUANTIZATION) 1708 #if defined(DST_DEPTH) 1709 s_addr += b * src_stride_w;
1710 d_addr += b * dst_stride_w;
1711 #endif // defined(DST_DEPTH) 1713 #if DEPTH_MULTIPLIER > 1 1714 for(
int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
1716 #endif // DEPTH_MULTIPLIER > 1 1720 int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
1721 int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
1723 for(
int yk = 0; yk < KERNEL_HEIGHT; ++yk)
1725 if(y_coord >= 0 && y_coord < SRC_DIM2)
1727 int x_coord_tmp = x_coord;
1729 for(
int xk = 0; xk < KERNEL_WIDTH; ++xk)
1731 if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
1733 int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (
int)src_stride_z;
1740 res += (i + (
VEC_INT)INPUT_OFFSET) * (w + (
VEC_INT)WEIGHTS_OFFSET);
1742 x_coord_tmp += DILATION_X;
1745 y_coord += DILATION_Y;
1748 #if defined(HAS_BIAS) 1751 #endif // defined(HAS_BIAS) 1753 #if defined(PER_CHANNEL_QUANTIZATION) 1754 VEC_INT output_multiplier =
VLOAD(N0)(0, (__global
int *)(out_mul_addr));
1755 VEC_INT output_shift =
VLOAD(N0)(0, (__global
int *)(out_shift_addr));
1759 res =
select(res_shift_lt0, res_shift_gt0, (
VEC_INT)(output_shift) >= 0);
1760 #else // defined(PER_CHANNEL_QUANTIZATION) 1761 #if OUTPUT_SHIFT < 0 1763 #else // OUTPUT_SHIFT < 0 1765 #endif // OUTPUT_OFFSET < 0 1766 #endif // defined(PER_CHANNEL_QUANTIZATION) 1768 res += (
VEC_INT)OUTPUT_OFFSET;
1776 #if DEPTH_MULTIPLIER > 1 1777 w_addr +=
sizeof(WEIGHTS_TYPE);
1779 #if defined(PER_CHANNEL_QUANTIZATION) 1780 out_mul_addr +=
sizeof(int);
1781 out_shift_addr +=
sizeof(int);
1782 #endif // defined(PER_CHANNEL_QUANTIZATION) 1783 #if defined(HAS_BIAS) 1784 b_addr +=
sizeof(int);
1785 #endif // defined(HAS_BIAS) 1787 #endif // DEPTH_MULTIPLIER > 1 1789 #endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER) 1790 #endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE) Structure to hold Vector information.
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of an Image.
const size_t weights_stride_y
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
#define IMAGE_DECLARATION(name)
#define CONVERT_SAT(a, b)
Structure to hold 3D tensor information.
SimpleTensor< float > src
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)
#define CONVERT_TO_VECTOR_STRUCT(name)
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)
#define VECTOR_DECLARATION(name)
__global uchar * ptr
Pointer to the starting position of the buffer.
Structure to hold Image information.
int round(float x, RoundingPolicy rounding_policy)
Return a rounded value of x.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
Store a vector that can only be partial in x.
__global uchar * ptr
Pointer to the starting position of the buffer.
SimpleTensor< T > select(const SimpleTensor< uint8_t > &c, const SimpleTensor< T > &x, const SimpleTensor< T > &y)
__global const uchar * vector_offset(const Vector *vec, int x)
Get the pointer position of a Vector.
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size)
#define ACTIVATION_FUNC(x)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size)
const size_t weights_stride_z
#define TENSOR4D_DECLARATION(name)
__global uchar * ptr
Pointer to the starting position of the buffer.
#define TENSOR3D_DECLARATION(name)
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)
#define VEC_DATA_TYPE(type, size)