28 #if defined(DATA_TYPE) && defined(ACC_DATA_TYPE) 30 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 31 #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 32 #define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val)); 33 #else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 34 #define ARM_DOT(x, y, val) val += arm_dot((x), (y)); 35 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) 36 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 38 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 40 #define ARM_DOT1(a, b, c) \ 42 ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \ 44 #define ARM_DOT2(a, b, c) \ 46 ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \ 48 #define ARM_DOT3(a, b, c) \ 50 ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \ 52 #define ARM_DOT4(a, b, c) \ 56 #define ARM_DOT8(a, b, c) \ 58 ARM_DOT4((a.lo), (b.lo), c); \ 59 ARM_DOT4((a.hi), (b.hi), c); \ 61 #define ARM_DOT16(a, b, c) \ 63 ARM_DOT8((a.lo), (b.lo), c); \ 64 ARM_DOT8((a.hi), (b.hi), c); \ 67 #else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 70 #define ARM_DOT1(a, b, c) \ 72 c += (ACC_DATA_TYPE)a * b; \ 74 #define ARM_DOT2(a, b, c) \ 76 c += (ACC_DATA_TYPE)a.s0 * b.s0; \ 77 c += (ACC_DATA_TYPE)a.s1 * b.s1; \ 79 #define ARM_DOT3(a, b, c) \ 82 c += (ACC_DATA_TYPE)a.s2 * b.s2; \ 84 #define ARM_DOT4(a, b, c) \ 87 c += (ACC_DATA_TYPE)a.s3 * b.s3; \ 89 #define ARM_DOT8(a, b, c) \ 91 ARM_DOT4((a.lo), (b.lo), c); \ 92 ARM_DOT4((a.hi), (b.hi), c); \ 94 #define ARM_DOT16(a, b, c) \ 96 ARM_DOT8((a.lo), (b.lo), c); \ 97 ARM_DOT8((a.hi), (b.hi), c); \ 99 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 102 #define ARM_DOT_K0X1(k0, a, b, c) \ 104 ARM_DOT_K0(k0, (a), (b##0), (c)); \ 106 #define ARM_DOT_K0X2(k0, a, b, c) \ 108 ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \ 109 ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \ 111 #define ARM_DOT_K0X3(k0, a, b, c) \ 113 ARM_DOT_K0X2(k0, a, b, c); \ 114 ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \ 116 #define ARM_DOT_K0X4(k0, a, b, c) \ 118 ARM_DOT_K0X3(k0, a, b, c); \ 119 ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \ 121 #define ARM_DOT_K0X8(k0, a, b, c) \ 123 ARM_DOT_K0X4(k0, a, b, c); \ 124 ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \ 125 ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \ 126 ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \ 127 ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \ 129 #define ARM_DOT_K0X16(k0, a, b, c) \ 131 ARM_DOT_K0X8(k0, a, b, c); \ 132 ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \ 133 ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \ 134 ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \ 135 ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \ 136 ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \ 137 ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \ 138 ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \ 139 ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \ 143 #define ARM_MM_K0XN0X1(n0, k0, a, b, c) \ 145 ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \ 147 #define ARM_MM_K0XN0X2(n0, k0, a, b, c) \ 149 ARM_MM_K0XN0X1(n0, k0, a, b, c); \ 150 ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \ 152 #define ARM_MM_K0XN0X3(n0, k0, a, b, c) \ 154 ARM_MM_K0XN0X2(n0, k0, a, b, c); \ 155 ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \ 157 #define ARM_MM_K0XN0X4(n0, k0, a, b, c) \ 159 ARM_MM_K0XN0X3(n0, k0, a, b, c); \ 160 ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \ 162 #define ARM_MM_K0XN0X5(n0, k0, a, b, c) \ 164 ARM_MM_K0XN0X4(n0, k0, a, b, c); \ 165 ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \ 167 #define ARM_MM_K0XN0X6(n0, k0, a, b, c) \ 169 ARM_MM_K0XN0X5(n0, k0, a, b, c); \ 170 ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \ 172 #define ARM_MM_K0XN0X7(n0, k0, a, b, c) \ 174 ARM_MM_K0XN0X6(n0, k0, a, b, c); \ 175 ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \ 177 #define ARM_MM_K0XN0X8(n0, k0, a, b, c) \ 179 ARM_MM_K0XN0X7(n0, k0, a, b, c); \ 180 ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \ 183 #define ARM_DOT_K0(k0, a, b, c) \ 185 CONCAT(ARM_DOT, k0) \ 189 #define ARM_DOT_K0XN0(n0, k0, a, b, c) \ 191 CONCAT(ARM_DOT_K0X, n0) \ 195 #define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \ 197 CONCAT(ARM_MM_K0XN0X, m0) \ 202 #define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) \ 204 c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \ 206 #define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \ 208 c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \ 209 c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \ 211 #define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \ 213 ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \ 214 c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \ 216 #define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \ 218 ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \ 219 c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \ 221 #define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \ 223 ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \ 224 c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \ 225 c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \ 226 c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \ 227 c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \ 229 #define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \ 231 ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \ 232 c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \ 233 c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \ 234 c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \ 235 c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \ 236 c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \ 237 c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \ 238 c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \ 239 c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \ 242 #define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \ 244 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \ 246 #define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \ 248 ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \ 249 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \ 251 #define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \ 253 ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \ 254 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \ 256 #define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \ 258 ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \ 259 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \ 261 #define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \ 263 ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \ 264 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \ 266 #define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \ 268 ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \ 269 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \ 271 #define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \ 273 ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \ 274 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \ 276 #define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \ 278 ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \ 279 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \ 281 #define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \ 283 CONCAT(ARM_MUL_N0X, k0) \ 284 (VECTOR_ACC_TYPE, (a), b, (c)); \ 286 #define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \ 288 CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \ 289 (VECTOR_ACC_TYPE, k0, a, b, c); \ 292 #if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 350 #
if defined(REINTERPRET_OUTPUT_AS_3D)
352 uint dst_cross_plane_pad
357 #define LHS_BLOCK_SIZE ((K0) * (M0)) 359 #if defined(LHS_INTERLEAVE) 360 #define LHS_OFFSET_X (K0) 361 #define LHS_STEP_X ((K0) * (V0)) 362 #define LHS_STEP_LOOP (1) 363 #else // defined(INTERLEAVE) 364 #define LHS_OFFSET_X (LHS_BLOCK_SIZE) 365 #define LHS_STEP_X (K0) 366 #define LHS_STEP_LOOP (V0) 367 #endif // defined(INTERLEAVE) 370 #define RHS_BLOCK_SIZE ((K0) * (N0)) 373 #if defined(RHS_INTERLEAVE) 374 #define RHS_OFFSET_X (K0) 375 #define RHS_STEP_X ((K0) * (H0)) 376 #define RHS_STEP_LOOP (1) 377 #else // defined(RHS_INTERLEAVE) 378 #define RHS_OFFSET_X (RHS_BLOCK_SIZE) 379 #define RHS_STEP_X (K0) 380 #define RHS_STEP_LOOP (H0) 381 #endif // defined(RHS_INTERLEAVE) 383 uint x = get_global_id(0);
384 uint y = get_global_id(1);
385 uint z = get_global_id(2);
387 #if defined(DUMMY_WORK_ITEMS) 388 if((x * N0 >=
N) || (y * M0 >=
M))
392 #endif // defined(DUMMY_WORK_ITEMS) 395 __global
DATA_TYPE *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
398 __global
DATA_TYPE *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
400 #if defined(MATRIX_B_DEPTH) 402 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
403 #else // defined(MATRIX_B_DEPTH) 404 rhs_addr += z * rhs_stride_z;
405 #endif // defined(MATRIX_B_DEPTH) 413 for(
int i = 0; i < k; i += K0)
422 ARM_MM_K0XN0XM0(M0, N0, K0, a,
b, c);
425 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
426 rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
429 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 *
sizeof(
int)) + (y * (uint)M0 * dst_stride_y);
433 #if defined(REINTERPRET_OUTPUT_AS_3D) 435 CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
439 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
441 #else // defined(REINTERPRET_OUTPUT_AS_3D) 444 dst_addr += z * dst_stride_z;
446 #endif // defined(REINTERPRET_OUTPUT_AS_3D) 449 const bool cond_y = ((get_global_id(1) + 1) * M0 >=
M);
450 const bool cond_x = ((get_global_id(0) + 1) * N0 >=
N);
454 STORE_BLOCK_BOUNDARY_AWARE(M0, N0,
int, c_lp, dst_addr, dst_stride_y, zout,
PARTIAL_STORE_M0,
PARTIAL_STORE_N0, cond_y, cond_x);
456 #undef LHS_BLOCK_SIZE 459 #undef RHS_BLOCK_SIZE 463 #endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 465 #if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 521 #
if defined(REINTERPRET_INPUT_AS_3D)
523 uint lhs_cross_plane_pad
525 #
if defined(REINTERPRET_OUTPUT_AS_3D)
527 uint dst_cross_plane_pad
532 #define RHS_BLOCK_SIZE ((K0) * (N0)) 535 #if defined(RHS_INTERLEAVE) 536 #define RHS_OFFSET_X (K0) 537 #define RHS_STEP_X ((K0) * (H0)) 538 #define RHS_STEP_LOOP (1) 539 #else // defined(RHS_INTERLEAVE) 540 #define RHS_OFFSET_X (RHS_BLOCK_SIZE) 541 #define RHS_STEP_X (K0) 542 #define RHS_STEP_LOOP (H0) 543 #endif // defined(RHS_INTERLEAVE) 545 uint x = get_global_id(0);
546 uint y = get_global_id(1);
547 uint z = get_global_id(2);
549 #if defined(DUMMY_WORK_ITEMS) 550 if((x * N0 >=
N) || (y * M0 >=
M))
554 #endif // defined(DUMMY_WORK_ITEMS) 560 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
562 #if defined(MATRIX_B_DEPTH) 564 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
565 #else // defined(MATRIX_B_DEPTH) 566 rhs_offset += z * rhs_stride_z;
567 #endif // defined(MATRIX_B_DEPTH) 572 #if defined(REINTERPRET_INPUT_AS_3D) 578 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
580 #else // defined(REINTERPRET_INPUT_AS_3D) 583 lhs_offset += z * lhs_stride_z;
585 #endif // defined(REINTERPRET_INPUT_AS_3D) 591 for(; i <= (
K - K0); i += K0)
600 ARM_MM_K0XN0XM0(M0, N0, K0, a,
b, c);
603 rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
614 ARM_MM_K0XN0XM0(M0, N0, 1, a,
b, c);
622 #if defined(REINTERPRET_OUTPUT_AS_3D) 628 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
630 #else // defined(REINTERPRET_OUTPUT_AS_3D) 633 dst_addr += z * dst_stride_z;
635 #endif // defined(REINTERPRET_OUTPUT_AS_3D) 638 const bool cond_y = y == 0;
639 const bool cond_x = ((x + 1) * N0 >=
N);
643 STORE_BLOCK_BOUNDARY_AWARE(M0, N0,
int, c_lp, dst_addr, dst_stride_y, zout,
PARTIAL_STORE_M0,
PARTIAL_STORE_N0, cond_y, cond_x);
645 #undef RHS_BLOCK_SIZE 650 #if defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER) 730 __kernel
void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(
IMAGE_DECLARATION(lhs),
736 #
if defined(REINTERPRET_INPUT_AS_3D)
738 uint lhs_cross_plane_pad
740 #
if defined(REINTERPRET_OUTPUT_AS_3D)
742 uint dst_cross_plane_pad
744 #
if defined(A_OFFSET)
748 #
if defined(B_OFFSET)
752 #
if defined(ADD_BIAS)
756 #
if defined(PER_CHANNEL_QUANTIZATION)
764 #define RHS_BLOCK_SIZE ((K0) * (N0)) 767 #if defined(RHS_INTERLEAVE) 768 #define RHS_OFFSET_X (K0) 769 #define RHS_STEP_X ((K0) * (H0)) 770 #define RHS_STEP_LOOP (1) 771 #else // defined(RHS_INTERLEAVE) 772 #define RHS_OFFSET_X (RHS_BLOCK_SIZE) 773 #define RHS_STEP_X (K0) 774 #define RHS_STEP_LOOP (H0) 775 #endif // defined(RHS_INTERLEAVE) 777 uint x = get_global_id(0);
778 uint y = get_global_id(1);
779 uint z = get_global_id(2);
781 #if defined(DUMMY_WORK_ITEMS) 782 if((x * N0 >=
N) || (y * M0 >=
M))
786 #endif // defined(DUMMY_WORK_ITEMS) 792 uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
794 #if defined(MATRIX_B_DEPTH) 796 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
797 #else // defined(MATRIX_B_DEPTH) 798 rhs_offset += z * rhs_stride_z;
799 #endif // defined(MATRIX_B_DEPTH) 804 #if defined(REINTERPRET_INPUT_AS_3D) 810 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
812 #else // defined(REINTERPRET_INPUT_AS_3D) 815 lhs_offset += z * lhs_stride_z;
817 #endif // defined(REINTERPRET_INPUT_AS_3D) 823 for(; i <= (K - K0); i += K0)
832 ARM_MM_K0XN0XM0(M0, N0, K0, a,
b, c);
835 rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
846 ARM_MM_K0XN0XM0(M0, N0, 1, a,
b, c);
855 #if defined(REINTERPRET_OUTPUT_AS_3D) 861 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
863 #else // defined(REINTERPRET_OUTPUT_AS_3D) 866 dst_addr += z * dst_stride_z;
868 #endif // defined(REINTERPRET_OUTPUT_AS_3D) 876 #if defined(A_OFFSET) 878 __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + (x * (uint)N0) *
sizeof(int);
880 #if defined(SUM_COL_HAS_BATCHES) 881 sum_col_addr += z * sum_col_stride_y;
882 #endif // defined(SUM_COL_HAS_BATCHES) 884 a_offset_s32 =
VLOAD(N0)(0, (__global
int *)sum_col_addr);
888 #endif // defined(A_OFFSET) 890 #if defined(B_OFFSET) 895 __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + (
COMPUTE_M0_START_ROW(y, (uint)M0,
PARTIAL_STORE_M0)) *
sizeof(
int) + z * sum_row_stride_y;
900 #endif // defined(B_OFFSET) 902 #if defined(ADD_BIAS) 904 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + (x * (uint)N0) *
sizeof(int);
907 bias_values =
VLOAD(N0)(0, (__global
int *)bias_addr);
909 #endif // defined(ADD_BIAS) 914 #if defined(PER_CHANNEL_QUANTIZATION) 915 __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + (x * (uint)N0) *
sizeof(int);
916 __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + (x * (uint)N0) *
sizeof(int);
919 res_mul =
VLOAD(N0)(0, (__global
int *)result_multipliers_addr);
921 res_shift =
VLOAD(N0)(0, (__global
int *)result_shifts_addr);
924 #else // defined(PER_CHANNEL_QUANTIZATION) 928 #else // RESULT_SHIFT >= 0 930 #endif // RESULT_SHIFT < 0 932 #endif // defined(PER_CHANNEL_QUANTIZATION) 937 #if defined(MIN_BOUND) 939 #endif // defined(MIN_BOUND) 940 #if defined(MAX_BOUND) 942 #endif // defined(MAX_BOUND) 945 const bool cond_y = y == 0;
946 const bool cond_x = ((x + 1) * N0 >=
N);
950 STORE_BLOCK_BOUNDARY_AWARE(M0, N0,
DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout,
PARTIAL_STORE_M0,
PARTIAL_STORE_N0, cond_y, cond_x);
952 #undef RHS_BLOCK_SIZE 956 #endif // defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER) 957 #endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 959 #if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 1013 #
if defined(REINTERPRET_INPUT_AS_3D)
1015 uint lhs_cross_plane_pad
1017 #
if defined(REINTERPRET_OUTPUT_AS_3D)
1019 uint dst_cross_plane_pad
1023 uint x = get_global_id(0);
1024 uint y = get_global_id(1);
1025 uint z = get_global_id(2);
1027 #if defined(DUMMY_WORK_ITEMS) 1028 if((x * N0 >=
N) || (y * M0 >=
M))
1032 #endif // defined(DUMMY_WORK_ITEMS) 1038 uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 *
sizeof(
DATA_TYPE);
1040 #if defined(MATRIX_B_DEPTH) 1042 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
1043 #else // defined(MATRIX_B_DEPTH) 1044 rhs_offset += z * rhs_stride_z;
1045 #endif // defined(MATRIX_B_DEPTH) 1050 #if defined(REINTERPRET_INPUT_AS_3D) 1056 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
1058 #else // defined(REINTERPRET_INPUT_AS_3D) 1061 lhs_offset += z * lhs_stride_z;
1063 #endif // defined(REINTERPRET_INPUT_AS_3D) 1070 for(; i <= (K - K0); i += K0)
1079 #if(GPU_ARCH == GPU_ARCH_MIDGARD) 1080 ARM_MM_NATIVE_N0XK0XM0(
VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a,
b, c);
1081 #else // GPU_ARCH == GPU_ARCH_MIDGARD 1085 ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
1086 #endif // GPU_ARCH == GPU_ARCH_MIDGARD 1090 rhs_offset += K0 * rhs_stride_y;
1103 #if(GPU_ARCH == GPU_ARCH_MIDGARD) 1104 ARM_MM_NATIVE_N0XK0XM0(
VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a,
b, c);
1105 #else // GPU_ARCH == GPU_ARCH_MIDGARD 1109 ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
1110 #endif // GPU_ARCH == GPU_ARCH_MIDGARD 1114 rhs_offset += rhs_stride_y;
1121 #if defined(REINTERPRET_OUTPUT_AS_3D) 1127 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
1129 #else // defined(REINTERPRET_OUTPUT_AS_3D) 1132 dst_addr += z * dst_stride_z;
1134 #endif // defined(REINTERPRET_OUTPUT_AS_3D) 1135 const bool cond_y = y == 0;
1136 const bool cond_x = ((x + 1) * N0 >=
N);
1140 STORE_BLOCK_BOUNDARY_AWARE(M0, N0,
int, res, dst_addr, dst_stride_y, zout,
PARTIAL_STORE_M0,
PARTIAL_STORE_N0, cond_y, cond_x);
1142 #endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) 1180 ACC_DATA_TYPE sum_row = 0;
1182 __global const
DATA_TYPE *matrix_a = (__global const
DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
1187 for(; i <= ((
int)COLS_A - 16); i += 16)
1196 for(; i < COLS_A; ++i)
1198 sum_row += (ACC_DATA_TYPE)matrix_a[i];
1201 sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;
1204 sum_row *= (int)SCALAR;
1205 #endif // defined(SCALAR) 1206 *((__global
int *)dst.
ptr) = (int)sum_row;
1209 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 1243 ACC_DATA_TYPE sum_row = 0;
1245 __global
const DATA_TYPE *matrix_a = (__global
const DATA_TYPE *)(src.
ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
1250 for(; i <= ((int)COLS_A - 32); i += 32)
1253 a0 = vload16(0, matrix_a + i);
1256 sum_row += arm_dot(a0.s4567, (
VEC_DATA_TYPE(DATA_TYPE, 4))(1));
1257 sum_row += arm_dot(a0.s89AB, (
VEC_DATA_TYPE(DATA_TYPE, 4))(1));
1258 sum_row += arm_dot(a0.sCDEF, (
VEC_DATA_TYPE(DATA_TYPE, 4))(1));
1260 a0 = vload16(1, matrix_a + i);
1262 sum_row += arm_dot(a0.s0123, (
VEC_DATA_TYPE(DATA_TYPE, 4))(1));
1263 sum_row += arm_dot(a0.s4567, (
VEC_DATA_TYPE(DATA_TYPE, 4))(1));
1264 sum_row += arm_dot(a0.s89AB, (
VEC_DATA_TYPE(DATA_TYPE, 4))(1));
1265 sum_row += arm_dot(a0.sCDEF, (
VEC_DATA_TYPE(DATA_TYPE, 4))(1));
1269 for(; i < COLS_A; ++i)
1271 sum_row += (ACC_DATA_TYPE)matrix_a[i];
1275 sum_row *= (int)SCALAR;
1276 #endif // defined(SCALAR) 1277 *((__global
int *)dst.
ptr) = (int)sum_row;
1279 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) 1280 #endif // defined(COLS_A) 1282 #if defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) 1316 const uint y = get_global_id(1);
1318 __global
const DATA_TYPE *matrix_b = (__global
const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs *
sizeof(
DATA_TYPE) + y * src_step_y + y * src_stride_z);
1319 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs *
sizeof(int) + y * dst_stride_y;
1326 for(; i <= ((int)ROWS_B - 4); i += 4)
1340 matrix_b += 4 * src_stride_y;
1344 for(; i < (int)ROWS_B; ++i)
1351 matrix_b += src_stride_y;
1356 #endif // defined(SCALAR) 1362 #endif // defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) 1364 #endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE) 1366 #if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) 1368 #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) 1402 inline VEC_INT offset_contribution(
1406 #
if defined(A_OFFSET)
1410 #
if defined(B_OFFSET)
1414 #
if defined(ADD_BIAS)
1424 #if defined(DEPTH_INPUT3D) 1425 batch_id /= (int)DEPTH_INPUT3D;
1426 #endif // defined(DEPTH_INPUT3D) 1428 #if defined(A_OFFSET) 1430 __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x *
sizeof(int);
1433 #if defined(SUM_COL_HAS_BATCHES) 1434 a_offset_s32 =
VLOAD(
VEC_SIZE)(0, (__global
int *)(sum_col_addr + batch_id * sum_col_stride_y));
1435 #else // defined(SUM_COL_HAS_BATCHES) 1436 a_offset_s32 =
VLOAD(
VEC_SIZE)(0, (__global
int *)sum_col_addr);
1437 #endif // defined(SUM_COL_HAS_BATCHES) 1439 a_offset_s32 *= (
VEC_INT)A_OFFSET;
1440 #endif // defined(A_OFFSET) 1442 #if defined(B_OFFSET) 1444 __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y *
sizeof(int);
1447 #if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) 1448 b_offset_s32 = (
VEC_INT) * (((__global
int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
1449 #else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) 1450 b_offset_s32 = (
VEC_INT) * (((__global
int *)(sum_row_addr + batch_id * sum_row_stride_y)));
1451 #endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) 1452 b_offset_s32 *= (
VEC_INT)B_OFFSET;
1453 #endif // defined(B_OFFSET) 1455 #if defined(ADD_BIAS) 1457 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x *
sizeof(int);
1460 b_offset_s32 += (
VEC_INT)biases_values;
1461 #endif // defined(ADD_BIAS) 1463 return (
VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32;
1511 #
if defined(A_OFFSET)
1515 #
if defined(B_OFFSET)
1519 #
if defined(ADD_BIAS)
1526 const int y = get_global_id(1);
1527 const int z = get_global_id(2);
1530 VEC_INT offset_term_s32 = offset_contribution(
1532 #
if defined(A_OFFSET)
1539 sum_col_offset_first_element_in_bytes
1541 #
if defined(B_OFFSET)
1548 sum_row_offset_first_element_in_bytes
1550 #
if defined(ADD_BIAS)
1555 biases_offset_first_element_in_bytes
1559 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x *
sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
1564 in_s32_0 += offset_term_s32;
1570 #if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE) 1650 #
if defined(A_OFFSET)
1654 #
if defined(B_OFFSET)
1659 #
if defined(ADD_BIAS)
1663 #
if defined(PER_CHANNEL_QUANTIZATION)
1671 const int y = get_global_id(1);
1672 const int z = get_global_id(2);
1674 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
1677 VEC_INT offset_term_s32 = offset_contribution(
1679 #
if defined(A_OFFSET)
1686 sum_col_offset_first_element_in_bytes
1688 #
if defined(B_OFFSET)
1695 sum_row_offset_first_element_in_bytes
1697 #
if defined(ADD_BIAS)
1702 biases_offset_first_element_in_bytes
1706 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x *
sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
1711 in_s32 += offset_term_s32;
1716 in_s32 += (
VEC_INT)RESULT_OFFSET;
1719 #if defined(PER_CHANNEL_QUANTIZATION) 1720 __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x *
sizeof(int);
1721 __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x *
sizeof(int);
1722 VEC_INT result_multipliers_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)result_multipliers_addr);
1725 in_s32 *= result_multipliers_values;
1726 in_s32 >>= result_shifts_values;
1727 #else // defined(PER_CHANNEL_QUANTIZATION) 1728 in_s32 *= RESULT_MULTIPLIER;
1730 in_s32 >>= RESULT_SHIFT;
1731 #endif // defined(PER_CHANNEL_QUANTIZATION) 1736 #if defined(MIN_BOUND) 1738 #endif // defined(MIN_BOUND) 1739 #if defined(MAX_BOUND) 1741 #endif // defined(MAX_BOUND) 1744 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
1825 __kernel
void gemmlowp_offset_contribution_quantize_down_fixedpoint(
TENSOR3D_DECLARATION(mm_result)
1826 #
if defined(A_OFFSET)
1830 #
if defined(B_OFFSET)
1835 #
if defined(ADD_BIAS)
1839 #
if defined(PER_CHANNEL_QUANTIZATION)
1847 const int y = get_global_id(1);
1848 const int z = get_global_id(2);
1851 VEC_INT offset_term_s32 = offset_contribution(
1853 #
if defined(A_OFFSET)
1860 sum_col_offset_first_element_in_bytes
1862 #
if defined(B_OFFSET)
1869 sum_row_offset_first_element_in_bytes
1871 #
if defined(ADD_BIAS)
1876 biases_offset_first_element_in_bytes
1880 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x *
sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
1882 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
1887 in_s32 += offset_term_s32;
1892 #if defined(PER_CHANNEL_QUANTIZATION) 1893 __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x *
sizeof(int);
1894 __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x *
sizeof(int);
1895 VEC_INT result_multipliers_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)result_multipliers_addr);
1900 in_s32 =
select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
1901 #else // defined(PER_CHANNEL_QUANTIZATION) 1903 #if RESULT_SHIFT < 0 1905 #else // RESULT_SHIFT >= 0 1907 #endif // RESULT_SHIFT < 0 1909 #endif // defined(PER_CHANNEL_QUANTIZATION) 1912 in_s32 += (
VEC_INT)RESULT_OFFSET;
1917 #if defined(MIN_BOUND) 1919 #endif // defined(MIN_BOUND) 1920 #if defined(MAX_BOUND) 1922 #endif // defined(MAX_BOUND) 1925 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
1927 #endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE) 1931 #endif // defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) 1933 #if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) 1978 #
if defined(ADD_BIAS)
1985 int y = get_global_id(1);
1986 int z = get_global_id(2);
1988 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(int) + y * src_stride_y + z * src_stride_z;
1990 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
1993 input_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)src_addr);
1995 #if defined(ADD_BIAS) 1997 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x *
sizeof(int);
2000 biases_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)bias_addr);
2001 input_values += biases_values;
2002 #endif // defined(ADD_BIAS) 2008 input_values *= RESULT_MULT_INT;
2010 #if RESULT_SHIFT < 0 2011 input_values >>= -RESULT_SHIFT;
2012 #else // RESULT_SHIFT >= 0 2013 input_values >>= RESULT_SHIFT;
2014 #endif // RESULT_SHIFT < 0 2019 #if defined(MIN_BOUND) 2021 #endif // defined(MIN_BOUND) 2022 #if defined(MAX_BOUND) 2024 #endif // defined(MAX_BOUND) 2027 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
2029 #endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) 2031 #if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) 2077 #
if defined(ADD_BIAS)
2084 int y = get_global_id(1);
2085 int z = get_global_id(2);
2087 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(int) + y * src_stride_y + z * src_stride_z;
2089 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2092 input_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)src_addr);
2094 #if defined(ADD_BIAS) 2096 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x *
sizeof(int);
2099 biases_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)bias_addr);
2100 input_values += biases_values;
2101 #endif // defined(ADD_BIAS) 2104 #if RESULT_SHIFT < 0 2106 #else // RESULT_SHIFT >= 0 2108 #endif // RESULT_SHIFT < 0 2116 #if defined(MIN_BOUND) 2118 #endif // defined(MIN_BOUND) 2119 #if defined(MAX_BOUND) 2121 #endif // defined(MAX_BOUND) 2124 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
2126 #endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) 2128 #if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) 2171 __kernel
void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(
TENSOR3D_DECLARATION(src),
2172 #
if defined(ADD_BIAS)
2179 int y = get_global_id(1);
2180 int z = get_global_id(2);
2182 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(int) + y * src_stride_y + z * src_stride_z;
2184 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x *
sizeof(short) + y * dst_stride_y + z * dst_stride_z;
2187 input_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)src_addr);
2189 #if defined(ADD_BIAS) 2191 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x *
sizeof(int);
2194 biases_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)bias_addr);
2195 input_values += biases_values;
2196 #endif // defined(ADD_BIAS) 2199 #if RESULT_SHIFT < 0 2201 #else // RESULT_SHIFT >= 0 2203 #endif // RESULT_SHIFT < 0 2208 #if defined(MIN_BOUND) 2210 #endif // defined(MIN_BOUND) 2211 #if defined(MAX_BOUND) 2213 #endif // defined(MAX_BOUND) 2218 #endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) 2220 #if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET) 2268 #
if defined(ADD_BIAS)
2271 #
if defined(DST_HEIGHT)
2273 #else // defined(DST_HEIGHT) 2279 int y = get_global_id(1);
2280 int z = get_global_id(2);
2282 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(int) + y * src_stride_y + z * src_stride_z;
2284 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2287 input_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)src_addr);
2289 #if defined(ADD_BIAS) 2291 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x *
sizeof(int);
2294 biases_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)bias_addr);
2295 input_values += (int4)biases_values;
2296 #endif // defined(ADD_BIAS) 2299 float4 input_values_f = convert_float4(input_values);
2300 input_values_f =
round(input_values_f * (
float)REAL_MULTIPLIER + (
float)OUTPUT_OFFSET);
2305 #if defined(MIN_BOUND) 2307 #endif // defined(MIN_BOUND) 2308 #if defined(MAX_BOUND) 2310 #endif // defined(MAX_BOUND) 2313 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
2315 #endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL)
#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL)
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT)
#define CONVERT_TO_IMAGE_STRUCT(name)
#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT)
#define IMAGE_DECLARATION(name)
#define CONVERT_SAT(a, b)
#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
for(size_t k=0;k< _target.size();++k)
#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL)
Structure to hold 3D tensor information.
#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL)
SimpleTensor< float > src
#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B)
#define VECTOR_DECLARATION(name)
#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT)
#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT)
#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC)
#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
Structure to hold Image information.
int round(float x, RoundingPolicy rounding_policy)
Return a rounded value of x.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
Store a vector that can only be partial in x.
__global uchar * ptr
Pointer to the starting postion of the buffer.
#define CONVERT_TO_TENSOR3D_STRUCT(name)
SimpleTensor< T > select(const SimpleTensor< uint8_t > &c, const SimpleTensor< T > &x, const SimpleTensor< T > &y)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size)
#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL)
#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE)
Create transposed vectors form the given source vectors.
#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size)
#define TENSOR4D_DECLARATION(name)
__global uchar * ptr
Pointer to the starting postion of the buffer.
#define TENSOR3D_DECLARATION(name)
#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0)
#define VEC_DATA_TYPE(type, size)