29 #if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
31 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
// ARM_DOT(x, y, val): 8-bit integer dot product of two 4-element vectors x and y,
// accumulated into val. When the accumulate variant of the Arm dot-product
// extension is available, the fused arm_dot_acc() builtin is used; otherwise the
// fallback computes arm_dot() and adds the result in a separate step.
32 #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
33 #define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
34 #else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
35 #define ARM_DOT(x, y, val) val += arm_dot((x), (y));
36 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
37 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
39 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
41 #define ARM_DOT1(a, b, c) \
43 ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
45 #define ARM_DOT2(a, b, c) \
47 ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \
49 #define ARM_DOT3(a, b, c) \
51 ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \
53 #define ARM_DOT4(a, b, c) \
57 #define ARM_DOT8(a, b, c) \
59 ARM_DOT4((a.lo), (b.lo), c); \
60 ARM_DOT4((a.hi), (b.hi), c); \
62 #define ARM_DOT16(a, b, c) \
64 ARM_DOT8((a.lo), (b.lo), c); \
65 ARM_DOT8((a.hi), (b.hi), c); \
68 #else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
71 #define ARM_DOT1(a, b, c) \
73 c += (ACC_DATA_TYPE)a * b; \
75 #define ARM_DOT2(a, b, c) \
77 c += (ACC_DATA_TYPE)a.s0 * b.s0; \
78 c += (ACC_DATA_TYPE)a.s1 * b.s1; \
80 #define ARM_DOT3(a, b, c) \
83 c += (ACC_DATA_TYPE)a.s2 * b.s2; \
85 #define ARM_DOT4(a, b, c) \
88 c += (ACC_DATA_TYPE)a.s3 * b.s3; \
90 #define ARM_DOT8(a, b, c) \
92 ARM_DOT4((a.lo), (b.lo), c); \
93 ARM_DOT4((a.hi), (b.hi), c); \
95 #define ARM_DOT16(a, b, c) \
97 ARM_DOT8((a.lo), (b.lo), c); \
98 ARM_DOT8((a.hi), (b.hi), c); \
100 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
103 #define ARM_DOT_K0X1(k0, a, b, c) \
105 ARM_DOT_K0(k0, (a), (b##0), (c)); \
107 #define ARM_DOT_K0X2(k0, a, b, c) \
109 ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
110 ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
112 #define ARM_DOT_K0X3(k0, a, b, c) \
114 ARM_DOT_K0X2(k0, a, b, c); \
115 ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
117 #define ARM_DOT_K0X4(k0, a, b, c) \
119 ARM_DOT_K0X3(k0, a, b, c); \
120 ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
122 #define ARM_DOT_K0X8(k0, a, b, c) \
124 ARM_DOT_K0X4(k0, a, b, c); \
125 ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
126 ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
127 ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
128 ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
130 #define ARM_DOT_K0X16(k0, a, b, c) \
132 ARM_DOT_K0X8(k0, a, b, c); \
133 ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
134 ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
135 ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
136 ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
137 ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
138 ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
139 ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
140 ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
144 #define ARM_MM_K0XN0X1(n0, k0, a, b, c) \
146 ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
148 #define ARM_MM_K0XN0X2(n0, k0, a, b, c) \
150 ARM_MM_K0XN0X1(n0, k0, a, b, c); \
151 ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
153 #define ARM_MM_K0XN0X3(n0, k0, a, b, c) \
155 ARM_MM_K0XN0X2(n0, k0, a, b, c); \
156 ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
158 #define ARM_MM_K0XN0X4(n0, k0, a, b, c) \
160 ARM_MM_K0XN0X3(n0, k0, a, b, c); \
161 ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
163 #define ARM_MM_K0XN0X5(n0, k0, a, b, c) \
165 ARM_MM_K0XN0X4(n0, k0, a, b, c); \
166 ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
168 #define ARM_MM_K0XN0X6(n0, k0, a, b, c) \
170 ARM_MM_K0XN0X5(n0, k0, a, b, c); \
171 ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
173 #define ARM_MM_K0XN0X7(n0, k0, a, b, c) \
175 ARM_MM_K0XN0X6(n0, k0, a, b, c); \
176 ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
178 #define ARM_MM_K0XN0X8(n0, k0, a, b, c) \
180 ARM_MM_K0XN0X7(n0, k0, a, b, c); \
181 ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
184 #define ARM_DOT_K0(k0, a, b, c) \
186 CONCAT(ARM_DOT, k0) \
190 #define ARM_DOT_K0XN0(n0, k0, a, b, c) \
192 CONCAT(ARM_DOT_K0X, n0) \
196 #define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
198 CONCAT(ARM_MM_K0XN0X, m0) \
203 #define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) \
205 c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \
207 #define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \
209 c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \
210 c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \
212 #define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \
214 ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \
215 c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \
217 #define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \
219 ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \
220 c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \
222 #define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \
224 ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \
225 c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \
226 c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \
227 c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \
228 c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \
230 #define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \
232 ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \
233 c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \
234 c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \
235 c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \
236 c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \
237 c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \
238 c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \
239 c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \
240 c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \
243 #define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \
245 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \
247 #define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \
249 ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \
250 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
252 #define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \
254 ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \
255 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
257 #define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \
259 ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \
260 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
262 #define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \
264 ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \
265 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
267 #define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \
269 ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \
270 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
272 #define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \
274 ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \
275 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
277 #define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \
279 ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \
280 ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
282 #define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
284 CONCAT(ARM_MUL_N0X, k0) \
285 (VECTOR_ACC_TYPE, (a), b, (c)); \
287 #define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
289 CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \
290 (VECTOR_ACC_TYPE, k0, a, b, c); \
293 #if defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)
351 #
if defined(REINTERPRET_OUTPUT_AS_3D)
353 uint dst_cross_plane_pad
358 #define LHS_BLOCK_SIZE ((K0) * (M0))
// Addressing parameters for the reshaped LHS block.
// - LHS_OFFSET_X: distance (in elements) between consecutive LHS blocks on x.
// - LHS_STEP_X:   stride (in elements) between consecutive K0-wide rows of a block.
// - LHS_STEP_LOOP: number of blocks advanced per loop iteration.
// When LHS_INTERLEAVE is defined, rows of V0 blocks are interleaved, so the row
// stride is scaled by V0; otherwise blocks are stored back-to-back.
360 #if defined(LHS_INTERLEAVE)
361 #define LHS_OFFSET_X (K0)
362 #define LHS_STEP_X ((K0) * (V0))
363 #define LHS_STEP_LOOP (1)
364 #else // defined(LHS_INTERLEAVE)
365 #define LHS_OFFSET_X (LHS_BLOCK_SIZE)
366 #define LHS_STEP_X (K0)
367 #define LHS_STEP_LOOP (V0)
368 #endif // defined(LHS_INTERLEAVE)
371 #define RHS_BLOCK_SIZE ((K0) * (N0))
// Addressing parameters for the reshaped RHS block, mirroring the LHS ones:
// when RHS_INTERLEAVE is defined, H0 blocks are interleaved so the per-row
// stride is scaled by H0; otherwise blocks are contiguous (RHS_BLOCK_SIZE apart).
374 #if defined(RHS_INTERLEAVE)
375 #define RHS_OFFSET_X (K0)
376 #define RHS_STEP_X ((K0) * (H0))
377 #define RHS_STEP_LOOP (1)
378 #else // defined(RHS_INTERLEAVE)
379 #define RHS_OFFSET_X (RHS_BLOCK_SIZE)
380 #define RHS_STEP_X (K0)
381 #define RHS_STEP_LOOP (H0)
382 #endif // defined(RHS_INTERLEAVE)
384 uint x = get_global_id(0);
385 uint y = get_global_id(1);
386 uint z = get_global_id(2);
388 #if defined(DUMMY_WORK_ITEMS)
389 if((x * N0 >=
N) || (y * M0 >=
M))
393 #endif // defined(DUMMY_WORK_ITEMS)
396 __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z));
399 __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y);
// Advance the RHS pointer to the current batch slice. When MATRIX_B_DEPTH is
// defined the RHS holds fewer slices than the output batch dimension, so the
// batch index z wraps around (matrix B is broadcast across batches); otherwise
// each batch index addresses its own RHS slice.
401 #if defined(MATRIX_B_DEPTH)
403 rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
404 #else // defined(MATRIX_B_DEPTH)
405 rhs_addr += z * rhs_stride_z;
406 #endif // defined(MATRIX_B_DEPTH)
414 for(
int i = 0; i < k; i += K0)
417 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);
420 LOAD_BLOCK(N0, K0, DATA_TYPE,
b, rhs_addr, 0, RHS_STEP_X, zrhs);
423 ARM_MM_K0XN0XM0(M0, N0, K0, a,
b, c);
426 lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
427 rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
430 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 *
sizeof(
int)) + (y * (uint)M0 * dst_stride_y);
434 #if defined(REINTERPRET_OUTPUT_AS_3D)
436 CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
440 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
442 #else // defined(REINTERPRET_OUTPUT_AS_3D)
445 dst_addr += z * dst_stride_z;
447 #endif // defined(REINTERPRET_OUTPUT_AS_3D)
450 const bool cond_y = ((get_global_id(1) + 1) * M0 >=
M);
451 const bool cond_x = ((get_global_id(0) + 1) * N0 >=
N);
455 STORE_BLOCK_BOUNDARY_AWARE(M0, N0,
int, c_lp, dst_addr, dst_stride_y, zout,
PARTIAL_STORE_M0,
PARTIAL_STORE_N0, cond_y, cond_x);
457 #undef LHS_BLOCK_SIZE
460 #undef RHS_BLOCK_SIZE
464 #endif // defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)
466 #if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
// The fused fixed-point requantization output stage is enabled only when all
// three quantization parameters are provided at compile time.
467 #if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
468 #define FUSED_OUTPUT_STAGE_FIXED_POINT
469 #endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
550 #if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
551 __kernel
void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint
552 #elif defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T) // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
553 __kernel
void gemmlowp_mm_reshaped_only_rhs_t
554 #endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
561 #if defined(REINTERPRET_INPUT_AS_3D)
563 uint lhs_cross_plane_pad
564 #endif // REINTERPRET_INPUT_AS_3D
565 #if defined(REINTERPRET_OUTPUT_AS_3D)
567 uint dst_cross_plane_pad
568 #endif // REINTERPRET_OUTPUT_AS_3D
569 #if defined(A_OFFSET)
572 #endif // defined(A_OFFSET)
573 #if defined(B_OFFSET)
576 #endif // defined(B_OFFSET)
577 #if defined(ADD_BIAS)
580 #endif // defined(ADD_BIAS)
581 #if defined(PER_CHANNEL_QUANTIZATION)
585 #endif // defined(PER_CHANNEL_QUANTIZATION)
// Rows per batch slice, derived from the byte strides (stride_z / stride_y).
589 #define FULL_LHS_HEIGHT (lhs_stride_z / lhs_stride_y)
590 #define FULL_DST_HEIGHT (dst_stride_z / dst_stride_y)
// Addressing parameters for the transposed, reshaped-only RHS: interleaved
// storage scales the per-row stride by H0; non-interleaved blocks are K0*N0
// elements apart. RHS_STEP_LOOP is the total advance per K0 iteration.
593 #if defined(RHS_INTERLEAVE)
594 #define RHS_OFFSET_X (K0)
595 #define RHS_STEP_X (K0 * H0)
596 #else // defined(RHS_INTERLEAVE)
597 #define RHS_OFFSET_X (K0 * N0)
598 #define RHS_STEP_X (K0)
599 #endif // defined(RHS_INTERLEAVE)
600 #define RHS_STEP_LOOP (N0 * K0 * H0)
607 #if defined(DUMMY_WORK_ITEMS)
608 if((xo >=
N) || (y >=
M))
612 #endif // defined(DUMMY_WORK_ITEMS)
615 uint lhs_y = y + z * FULL_LHS_HEIGHT;
618 uint rhs_offset_x = (x % H0) * RHS_OFFSET_X;
619 uint rhs_offset_y = (x / H0) * rhs_stride_y;
// Select the RHS batch slice: with MATRIX_B_DEPTH the batch index z wraps
// (matrix B broadcast across batches); otherwise each z has its own slice.
621 #if defined(MATRIX_B_DEPTH)
623 rhs_offset_y += (z % MATRIX_B_DEPTH) * rhs_stride_z;
624 #else // defined(MATRIX_B_DEPTH)
625 rhs_offset_y += z * rhs_stride_z;
626 #endif // defined(MATRIX_B_DEPTH)
629 TILE(ACC_DATA_TYPE, M0, N0, c);
636 for(; i <= (
K - K0); i += K0)
638 TILE(DATA_TYPE, M0, K0, a);
639 TILE(DATA_TYPE, N0, K0,
b);
642 T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);
647 b[_i].v =
VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X));
651 T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a,
b, c);
653 rhs_offset_x += RHS_STEP_LOOP;
661 TILE(DATA_TYPE, M0, 1, a);
662 TILE(DATA_TYPE, N0, 1,
b);
665 T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);
669 b[_i].v = *(__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X);
672 T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a,
b, c);
676 #endif // ((K % K0) != 0)
678 #if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
680 TILE(
int, M0, N0, c_int);
681 TILE(
int, M0, N0, offset_s32);
692 #
if defined(A_OFFSET)
694 #if defined(SUM_COL_HAS_BATCHES)
696 #else // defined(SUM_COL_HAS_BATCHES)
698 #endif // defined(SUM_COL_HAS_BATCHES)
699 TILE(
int, 1, N0, a_offset_s32);
701 T_LOAD(
int, 1, N0, BUFFER, sum_col, xo, sum_col_y, 1, sum_col_stride_y, a_offset_s32);
703 a_offset_s32[0].v *= A_OFFSET;
706 #endif // defined(A_OFFSET)
708 #if defined(B_OFFSET)
713 TILE(
int, M0, N0, b_offset_s32);
715 T_LOAD(
int, M0, 1, BUFFER, sum_row, y + z * (sum_row_stride_y /
sizeof(
int)), 0, 1, sum_row_stride_x, b_offset_s32);
719 offset_s32[i].v += b_offset_s32[i].v *B_OFFSET;
724 #
if defined(ADD_BIAS)
728 T_LOAD(
int, 1, N0, BUFFER, biases, xo, 0, 1, 0,
bias);
731 #endif // defined(ADD_BIAS)
735 c_int[i].v += offset_s32[i].v;
738 TILE(DATA_TYPE, M0, N0, c_lp);
741 #if defined(PER_CHANNEL_QUANTIZATION)
742 TILE(
int, 1, N0, res_mul);
743 TILE(
int, 1, N0, res_shift);
745 T_LOAD(
int, 1, N0, BUFFER, result_multipliers, xo, 0, 0, 0, res_mul);
746 T_LOAD(
int, 1, N0, BUFFER, result_shifts, xo, 0, 0, 0, res_shift);
748 T_QUANTIZE8(
int, DATA_TYPE, PER_CHANNEL, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, res_mul, res_shift, c_lp);
749 #else // defined(PER_CHANNEL_QUANTIZATION)
750 T_QUANTIZE8(
int, DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, 0, 0, c_lp);
751 #endif // defined(PER_CHANNEL_QUANTIZATION)
753 #if defined(MIN_BOUND)
756 c_lp[i].v = max(c_lp[i].v, (
VEC_DATA_TYPE(DATA_TYPE, N0))MIN_BOUND);
759 #
if defined(MAX_BOUND)
762 c_lp[i].v = min(c_lp[i].v, (
VEC_DATA_TYPE(DATA_TYPE, N0))MAX_BOUND);
767 TILE(
int, M0, N0, c_lp);
775 TILE(uint, M0, 1, dst_indirect_y);
779 #if defined(REINTERPRET_OUTPUT_AS_3D)
780 dst_indirect_y[i].v = (uint)min((
int)((y + i) % HEIGHT_GEMM3D), (
int)HEIGHT_GEMM3D - 1);
781 dst_indirect_y[i].v += (uint)min((
int)((y + i) / HEIGHT_GEMM3D), (int)DEPTH_GEMM3D - 1) * FULL_DST_HEIGHT;
782 dst_indirect_y[i].v += z *FULL_DST_HEIGHT *DEPTH_GEMM3D;
783 #else // (REINTERPRET_OUTPUT_AS_3D)
784 dst_indirect_y[i].v = (uint)min((
int)y + i, (int)
M - 1) + z *FULL_DST_HEIGHT;
785 #endif // defined(REINTERPRET_OUTPUT_AS_3D)
790 #if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
791 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0,
PARTIAL_STORE_N0, BUFFER,
dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
792 #else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
793 T_STORE_INDIRECT_WIDTH_SELECT(
int, M0, N0,
PARTIAL_STORE_N0, BUFFER,
dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
794 #endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
800 #endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
802 #if defined(GEMMLOWP_MM_NATIVE)
856 #
if defined(REINTERPRET_INPUT_AS_3D)
858 uint lhs_cross_plane_pad
860 #
if defined(REINTERPRET_OUTPUT_AS_3D)
862 uint dst_cross_plane_pad
866 uint x = get_global_id(0);
867 uint y = get_global_id(1);
868 uint z = get_global_id(2);
870 #if defined(DUMMY_WORK_ITEMS)
871 if((x * N0 >=
N) || (y * M0 >=
M))
875 #endif // defined(DUMMY_WORK_ITEMS)
881 uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 *
sizeof(DATA_TYPE);
// Select the RHS batch slice (same broadcast-on-wrap scheme as the reshaped
// kernels): z wraps modulo MATRIX_B_DEPTH when matrix B has fewer batches.
883 #if defined(MATRIX_B_DEPTH)
885 rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
886 #else // defined(MATRIX_B_DEPTH)
887 rhs_offset += z * rhs_stride_z;
888 #endif // defined(MATRIX_B_DEPTH)
893 #if defined(REINTERPRET_INPUT_AS_3D)
899 lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
901 #else // defined(REINTERPRET_INPUT_AS_3D)
904 lhs_offset += z * lhs_stride_z;
906 #endif // defined(REINTERPRET_INPUT_AS_3D)
913 for(; i <= (
K - K0); i += K0)
916 LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
919 LOAD_BLOCK(K0, N0, DATA_TYPE,
b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
922 #if(GPU_ARCH == GPU_ARCH_MIDGARD)
923 ARM_MM_NATIVE_N0XK0XM0(
VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a,
b, c);
924 #else // GPU_ARCH == GPU_ARCH_MIDGARD
928 ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
929 #endif // GPU_ARCH == GPU_ARCH_MIDGARD
933 rhs_offset += K0 * rhs_stride_y;
940 LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
943 LOAD_BLOCK(1, N0, DATA_TYPE,
b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
946 #if(GPU_ARCH == GPU_ARCH_MIDGARD)
947 ARM_MM_NATIVE_N0XK0XM0(
VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a,
b, c);
948 #else // GPU_ARCH == GPU_ARCH_MIDGARD
952 ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
953 #endif // GPU_ARCH == GPU_ARCH_MIDGARD
957 rhs_offset += rhs_stride_y;
964 #if defined(REINTERPRET_OUTPUT_AS_3D)
970 dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
972 #else // defined(REINTERPRET_OUTPUT_AS_3D)
975 dst_addr += z * dst_stride_z;
977 #endif // defined(REINTERPRET_OUTPUT_AS_3D)
978 const bool cond_y = y == 0;
979 const bool cond_x = ((x + 1) * N0 >=
N);
983 STORE_BLOCK_BOUNDARY_AWARE(M0, N0,
int, res, dst_addr, dst_stride_y, zout,
PARTIAL_STORE_M0,
PARTIAL_STORE_N0, cond_y, cond_x);
985 #endif // defined(GEMMLOWP_MM_NATIVE)
987 #if defined(GEMMLOWP_MATRIX_A_REDUCTION)
1023 ACC_DATA_TYPE sum_row = 0;
1025 __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(
src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
1030 for(; i <= ((
int)COLS_A - 16); i += 16)
1032 const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i);
1039 for(; i < COLS_A; ++i)
1041 sum_row += (ACC_DATA_TYPE)matrix_a[i];
1044 sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;
1047 sum_row *= (int)SCALAR;
1048 #endif // defined(SCALAR)
1049 *((__global
int *)
dst.ptr) = (int)sum_row;
1051 #endif // defined(GEMMLOWP_MATRIX_A_REDUCTION)
1053 #if defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)
1087 ACC_DATA_TYPE sum_row = 0;
1089 __global
const DATA_TYPE *matrix_a = (__global
const DATA_TYPE *)(
src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
1094 for(; i <= ((int)COLS_A - 32); i += 32)
1097 a0 = vload16(0, matrix_a + i);
1104 a0 = vload16(1, matrix_a + i);
1113 for(; i < COLS_A; ++i)
1115 sum_row += (ACC_DATA_TYPE)matrix_a[i];
1119 sum_row *= (int)SCALAR;
1120 #endif // defined(SCALAR)
1121 *((__global
int *)
dst.ptr) = (int)sum_row;
1123 #endif // defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)
1125 #if defined(GEMMLOWP_MATRIX_B_REDUCTION)
1159 const uint y = get_global_id(1);
1161 __global
const DATA_TYPE *matrix_b = (__global
const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z);
1162 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs *
sizeof(int) + y * dst_stride_y;
1169 for(; i <= ((int)ROWS_B - 4); i += 4)
1183 matrix_b += 4 * src_stride_y;
1187 for(; i < (int)ROWS_B; ++i)
1194 matrix_b += src_stride_y;
1199 #endif // defined(SCALAR)
1205 #endif // defined(GEMMLOWP_MATRIX_B_REDUCTION)
1207 #endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
1209 #if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
// VEC_SIZE-wide int vector type used by the offset-contribution kernels below.
1211 #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
1245 inline VEC_INT offset_contribution(
1249 #
if defined(A_OFFSET)
1253 #
if defined(B_OFFSET)
1257 #
if defined(ADD_BIAS)
1263 VEC_INT a_offset_s32 = (VEC_INT)0;
1264 VEC_INT b_offset_s32 = (VEC_INT)0;
1267 #if defined(DEPTH_INPUT3D)
1268 batch_id /= (int)DEPTH_INPUT3D;
1269 #endif // defined(DEPTH_INPUT3D)
1271 #if defined(A_OFFSET)
1273 __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x *
sizeof(int);
1276 #if defined(SUM_COL_HAS_BATCHES)
1277 a_offset_s32 =
VLOAD(
VEC_SIZE)(0, (__global
int *)(sum_col_addr + batch_id * sum_col_stride_y));
1278 #else // defined(SUM_COL_HAS_BATCHES)
1279 a_offset_s32 =
VLOAD(
VEC_SIZE)(0, (__global
int *)sum_col_addr);
1280 #endif // defined(SUM_COL_HAS_BATCHES)
1282 a_offset_s32 *= (VEC_INT)A_OFFSET;
1283 #endif // defined(A_OFFSET)
1285 #if defined(B_OFFSET)
1287 __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y *
sizeof(int);
1290 #if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
1291 b_offset_s32 = (VEC_INT) * (((__global
int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
1292 #else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
1293 b_offset_s32 = (VEC_INT) * (((__global
int *)(sum_row_addr + batch_id * sum_row_stride_y)));
1294 #endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
1295 b_offset_s32 *= (VEC_INT)B_OFFSET;
1296 #endif // defined(B_OFFSET)
1298 #if defined(ADD_BIAS)
1300 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x *
sizeof(int);
1302 VEC_INT biases_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)bias_addr);
1303 b_offset_s32 += (VEC_INT)biases_values;
1304 #endif // defined(ADD_BIAS)
1306 return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32;
1309 #if defined(GEMMLOWP_OFFSET_CONTRIBUTION)
1355 #
if defined(A_OFFSET)
1359 #
if defined(B_OFFSET)
1363 #
if defined(ADD_BIAS)
1370 const int y = get_global_id(1);
1371 const int z = get_global_id(2);
1374 VEC_INT offset_term_s32 = offset_contribution(
1376 #
if defined(A_OFFSET)
1383 sum_col_offset_first_element_in_bytes
1385 #
if defined(B_OFFSET)
1392 sum_row_offset_first_element_in_bytes
1394 #
if defined(ADD_BIAS)
1399 biases_offset_first_element_in_bytes
1403 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x *
sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
1405 VEC_INT in_s32_0 =
VLOAD(
VEC_SIZE)(0, (__global
int *)mm_result_addr);
1408 in_s32_0 += offset_term_s32;
1413 #endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION)
1415 #if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)
1495 #
if defined(A_OFFSET)
1499 #
if defined(B_OFFSET)
1504 #
if defined(ADD_BIAS)
1508 #
if defined(PER_CHANNEL_QUANTIZATION)
1516 const int y = get_global_id(1);
1517 const int z = get_global_id(2);
1519 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
1522 VEC_INT offset_term_s32 = offset_contribution(
1524 #
if defined(A_OFFSET)
1531 sum_col_offset_first_element_in_bytes
1533 #
if defined(B_OFFSET)
1540 sum_row_offset_first_element_in_bytes
1542 #
if defined(ADD_BIAS)
1547 biases_offset_first_element_in_bytes
1551 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x *
sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
1553 VEC_INT in_s32 =
VLOAD(
VEC_SIZE)(0, (__global
int *)mm_result_addr);
1556 in_s32 += offset_term_s32;
1561 in_s32 += (VEC_INT)RESULT_OFFSET;
1564 #if defined(PER_CHANNEL_QUANTIZATION)
1565 __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x *
sizeof(int);
1566 __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x *
sizeof(int);
1567 VEC_INT result_multipliers_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)result_multipliers_addr);
1568 VEC_INT result_shifts_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)result_shifts_addr);
1570 in_s32 *= result_multipliers_values;
1571 in_s32 >>= result_shifts_values;
1572 #else // defined(PER_CHANNEL_QUANTIZATION)
1573 in_s32 *= RESULT_MULTIPLIER;
1575 in_s32 >>= RESULT_SHIFT;
1576 #endif // defined(PER_CHANNEL_QUANTIZATION)
1581 #if defined(MIN_BOUND)
1583 #endif // defined(MIN_BOUND)
1584 #if defined(MAX_BOUND)
1586 #endif // defined(MAX_BOUND)
1589 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
1591 #endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)
1593 #if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)
1672 __kernel
void gemmlowp_offset_contribution_quantize_down_fixedpoint(
TENSOR3D_DECLARATION(mm_result)
1673 #
if defined(A_OFFSET)
1677 #
if defined(B_OFFSET)
1682 #
if defined(ADD_BIAS)
1686 #
if defined(PER_CHANNEL_QUANTIZATION)
1694 const int y = get_global_id(1);
1695 const int z = get_global_id(2);
1698 VEC_INT offset_term_s32 = offset_contribution(
1700 #
if defined(A_OFFSET)
1707 sum_col_offset_first_element_in_bytes
1709 #
if defined(B_OFFSET)
1716 sum_row_offset_first_element_in_bytes
1718 #
if defined(ADD_BIAS)
1723 biases_offset_first_element_in_bytes
1727 __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x *
sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
1729 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
1731 VEC_INT in_s32 =
VLOAD(
VEC_SIZE)(0, (__global
int *)mm_result_addr);
1734 in_s32 += offset_term_s32;
1739 #if defined(PER_CHANNEL_QUANTIZATION)
1740 __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x *
sizeof(int);
1741 __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x *
sizeof(int);
1742 VEC_INT result_multipliers_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)result_multipliers_addr);
1743 VEC_INT result_shifts_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)result_shifts_addr);
1747 in_s32 =
select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
1748 #else // defined(PER_CHANNEL_QUANTIZATION)
1750 #if RESULT_SHIFT < 0
1752 #else // RESULT_SHIFT >= 0
1754 #endif // RESULT_SHIFT < 0
1756 #endif // defined(PER_CHANNEL_QUANTIZATION)
1759 in_s32 += (VEC_INT)RESULT_OFFSET;
1764 #if defined(MIN_BOUND)
1766 #endif // defined(MIN_BOUND)
1767 #if defined(MAX_BOUND)
1769 #endif // defined(MAX_BOUND)
1772 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
1774 #endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)
1778 #endif // defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
1780 #if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)
1825 #
if defined(ADD_BIAS)
1832 int y = get_global_id(1);
1833 int z = get_global_id(2);
1835 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(int) + y * src_stride_y + z * src_stride_z;
1837 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
1840 input_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)src_addr);
1842 #if defined(ADD_BIAS)
1844 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x *
sizeof(int);
1847 biases_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)bias_addr);
1848 input_values += biases_values;
1849 #endif // defined(ADD_BIAS)
1855 input_values *= RESULT_MULT_INT;
1857 #if RESULT_SHIFT < 0
1858 input_values >>= -RESULT_SHIFT;
1859 #else // RESULT_SHIFT >= 0
1860 input_values >>= RESULT_SHIFT;
1861 #endif // RESULT_SHIFT < 0
1866 #if defined(MIN_BOUND)
1868 #endif // defined(MIN_BOUND)
1869 #if defined(MAX_BOUND)
1871 #endif // defined(MAX_BOUND)
1874 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
1876 #endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)
1878 #if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)
1924 #
if defined(ADD_BIAS)
1931 int y = get_global_id(1);
1932 int z = get_global_id(2);
1934 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(int) + y * src_stride_y + z * src_stride_z;
1936 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
1939 input_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)src_addr);
1941 #if defined(ADD_BIAS)
1943 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x *
sizeof(int);
1946 biases_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)bias_addr);
1947 input_values += biases_values;
1948 #endif // defined(ADD_BIAS)
1951 #if RESULT_SHIFT < 0
1953 #else // RESULT_SHIFT >= 0
1955 #endif // RESULT_SHIFT < 0
1963 #if defined(MIN_BOUND)
1965 #endif // defined(MIN_BOUND)
1966 #if defined(MAX_BOUND)
1968 #endif // defined(MAX_BOUND)
1971 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
1973 #endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)
1975 #if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)
2018 #
if defined(ADD_BIAS)
2025 int y = get_global_id(1);
2026 int z = get_global_id(2);
2028 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(int) + y * src_stride_y + z * src_stride_z;
2030 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x *
sizeof(short) + y * dst_stride_y + z * dst_stride_z;
2033 input_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)src_addr);
2035 #if defined(ADD_BIAS)
2037 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x *
sizeof(int);
2040 biases_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)bias_addr);
2041 input_values += biases_values;
2042 #endif // defined(ADD_BIAS)
2045 #if RESULT_SHIFT < 0
2047 #else // RESULT_SHIFT >= 0
2049 #endif // RESULT_SHIFT < 0
2054 #if defined(MIN_BOUND)
2056 #endif // defined(MIN_BOUND)
2057 #if defined(MAX_BOUND)
2059 #endif // defined(MAX_BOUND)
2064 #endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)
2066 #if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)
2114 #
if defined(ADD_BIAS)
2117 #
if defined(DST_HEIGHT)
2119 #else // defined(DST_HEIGHT)
2125 int y = get_global_id(1);
2126 int z = get_global_id(2);
2128 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(int) + y * src_stride_y + z * src_stride_z;
2130 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
2133 input_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)src_addr);
2135 #if defined(ADD_BIAS)
2137 __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x *
sizeof(int);
2140 biases_values =
VLOAD(
VEC_SIZE)(0, (__global
int *)bias_addr);
2142 #endif // defined(ADD_BIAS)
2147 input_values_f =
round(input_values_f * (
float)REAL_MULTIPLIER + (
float)OUTPUT_OFFSET);
2152 #if defined(MIN_BOUND)
2154 #endif // defined(MIN_BOUND)
2155 #if defined(MAX_BOUND)
2157 #endif // defined(MAX_BOUND)
2160 STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
2162 #endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)