// Helper: loads one 1xN0 row of int bias values at column offset `x` and
// presumably broadcast-adds it across the M0 rows of the accumulator tile
// `acc` -- the add itself is in lines not visible here; confirm upstream.
// Compiled only when BIAS is defined (see the trailing #endif).
// NOTE(review): this view of the file is fragmentary -- several original
// lines are missing between the numbered fragments below.
31 inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes,
TILE(
int, M0, N0, acc), uint x)
// Scratch tile holding the single bias row.
33 TILE(
int, 1, N0, bias_tile);
// Load 1xN0 bias values starting at column x (stride 0: a flat 1-D bias).
36 T_LOAD(
int, 1, N0, BUFFER,
bias, x, 0, 1, 0, bias_tile);
41 #endif // defined(BIAS)
43 #if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
// Quantized matrix multiply: dst = LHS * RHS with LHS non-transposed (NT)
// and RHS non-transposed (NT).  Zero-point handling follows the standard
// expansion of (a - LHS_OFFSET) * (b - RHS_OFFSET): the accumulator is
// seeded with the constant K*LHS_OFFSET*RHS_OFFSET term, and per-row /
// per-column operand sums are subtracted after the K loop.
// NOTE(review): this view is fragmentary -- original lines are missing
// between the numbered fragments; comments describe only what is visible.
90 __kernel
void mat_mul_native_quantized_nt_nt(
// Position each tensor at this work-item's tile: LHS by row y, RHS by
// column x (contiguous elements, hence sizeof(DATA_TYPE)), dst by (x, y).
103 lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
104 rhs_offset_first_element_in_bytes += x *
sizeof(DATA_TYPE) + z * rhs_stride_z;
105 dst_offset_first_element_in_bytes += x *
sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
// M0xN0 int accumulator, seeded with the constant zero-point cross term.
108 TILE(
int, M0, N0, acc);
111 acc[i].v =
K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
// Running sums over the K dimension: b_sum per output column of RHS,
// a_sum per output row of LHS -- consumed by the correction step below.
114 TILE(
int, 1, N0, b_sum);
117 TILE(
int, 1, M0, a_sum);
// Main K loop: K0 reduction elements per iteration.
121 for(k = 0; k <=
K - K0; k += K0)
// Operand tiles; b is declared N0xK0 so the MMUL can use the NT,T form.
123 TILE(DATA_TYPE, M0, K0, a);
124 TILE(DATA_TYPE, N0, K0,
b);
137 T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
// Multiply-accumulate the M0xK0 by N0xK0 tiles into acc.
145 T_MMUL(DATA_TYPE, DATA_TYPE,
int, M0, N0, K0, NT, T, a,
b, acc);
// Accumulate operand sums for the zero-point correction.
151 a_sum[0].s[i] += (int)a[i].s[j];
159 b_sum[0].s[j] += (int)
b[j].s[i];
// Advance LHS along K (contiguous) and RHS along K (row stride).
163 lhs_offset_first_element_in_bytes += K0 *
sizeof(DATA_TYPE);
164 rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
// Leftover loop, one K element per step, for K not a multiple of K0
// (guarded by the conditional closed at the #endif below).
171 TILE(DATA_TYPE, M0, 1, a);
172 TILE(DATA_TYPE, N0, 1,
b);
185 T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
191 T_MMUL(DATA_TYPE, DATA_TYPE,
int, M0, N0, 1, NT, T, a,
b, acc);
197 a_sum[0].s[i] += (int)a[i].s[j];
205 b_sum[0].s[j] += (int)
b[j].s[i];
209 lhs_offset_first_element_in_bytes += 1 *
sizeof(DATA_TYPE);
210 rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
212 #endif // ((K % K0) != 0)
// Zero-point correction: subtract RHS_OFFSET * (LHS row sum) and
// LHS_OFFSET * (RHS column sum) from every accumulator element.
218 acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
// Optional bias addition (helper defined at the top of the file).
226 perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
227 #endif // defined(BIAS)
// Output tile in DATA_TYPE; the requantization of acc into accq happens
// in lines not visible here -- confirm against the full source.
230 TILE(DATA_TYPE, M0, N0, accq);
// Per-row indirect offsets for the boundary-aware store.
235 TILE(
int, M0, 1, indirect_buffer);
// Store accq, honouring a partial N0 at the right edge (x_cond).
241 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0,
PARTIAL_STORE_N0, BUFFER,
dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
243 #endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
245 #if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
// Quantized matrix multiply: dst = LHS * RHS^T -- LHS non-transposed (NT),
// RHS transposed (T).  Same zero-point scheme as the NT_NT kernel:
// constant K*LHS_OFFSET*RHS_OFFSET seed plus operand sums subtracted
// after the K loop.
// NOTE(review): fragmentary view -- original lines are missing between
// the numbered fragments; comments describe only what is visible.
292 __kernel
void mat_mul_native_quantized_nt_t(
// RHS is transposed, so output column x selects an RHS *row* (stride_y).
305 lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
306 rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
307 dst_offset_first_element_in_bytes += x *
sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
// M0xN0 int accumulator, seeded with the constant zero-point cross term.
310 TILE(
int, M0, N0, acc);
313 acc[i].v =
K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
// Running sums over K: a_sum per LHS row, b_sum per RHS (output) column.
316 TILE(
int, 1, M0, a_sum);
319 TILE(
int, 1, N0, b_sum);
// Main K loop: K0 reduction elements per iteration.
323 for(k = 0; k <=
K - K0; k += K0)
325 TILE(DATA_TYPE, M0, K0, a);
326 TILE(DATA_TYPE, N0, K0,
b);
// With RHS stored transposed, both operands read K0 contiguous elements
// per row, so plain row-major loads suffice for both tiles.
339 T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
340 T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y,
b);
342 T_MMUL(DATA_TYPE, DATA_TYPE,
int, M0, N0, K0, NT, T, a,
b, acc);
// Accumulate operand sums for the zero-point correction.
348 a_sum[0].s[i] += (int)a[i].s[j];
356 b_sum[0].s[i] += (int)
b[i].s[j];
// Advance both operands along K (contiguous within a row).
360 lhs_offset_first_element_in_bytes += K0 *
sizeof(DATA_TYPE);
361 rhs_offset_first_element_in_bytes += K0 *
sizeof(DATA_TYPE);
// Leftover loop, one K element per step, for K not a multiple of K0.
368 TILE(DATA_TYPE, M0, 1, a);
369 TILE(DATA_TYPE, N0, 1,
b);
382 T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
383 T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y,
b);
385 T_MMUL(DATA_TYPE, DATA_TYPE,
int, M0, N0, 1, NT, T, a,
b, acc);
391 a_sum[0].s[i] += (int)a[i].s[j];
399 b_sum[0].s[i] += (int)
b[i].s[j];
403 lhs_offset_first_element_in_bytes += 1 *
sizeof(DATA_TYPE);
404 rhs_offset_first_element_in_bytes += 1 *
sizeof(DATA_TYPE);
406 #endif // ((K % K0) != 0)
// Zero-point correction: remove RHS_OFFSET * (LHS row sum) and
// LHS_OFFSET * (RHS column sum) from each accumulator element.
412 acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((
int)(LHS_OFFSET)) * b_sum[0].s[j];
// Optional bias addition (helper defined at the top of the file).
420 perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
421 #endif // defined(BIAS)
// Output tile in DATA_TYPE; requantization step is in lines not visible
// here -- confirm against the full source.
424 TILE(DATA_TYPE, M0, N0, accq);
// Per-row indirect offsets for the boundary-aware store.
429 TILE(
int, M0, 1, indirect_buffer);
// Store accq, honouring a partial N0 at the right edge (x_cond).
435 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0,
PARTIAL_STORE_N0, BUFFER,
dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
437 #endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
439 #if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
// Quantized matrix multiply: dst = LHS^T * RHS -- LHS transposed (T),
// RHS non-transposed (NT).  Same zero-point scheme as the other variants
// in this file.
// NOTE(review): fragmentary view -- original lines are missing between
// the numbered fragments (including both operand loads); comments
// describe only what is visible.
486 __kernel
void mat_mul_native_quantized_t_nt(
// LHS is transposed, so output row y selects an LHS *column* (contiguous
// elements, hence sizeof(DATA_TYPE)); RHS column x likewise contiguous.
499 lhs_offset_first_element_in_bytes += y *
sizeof(DATA_TYPE) + z * lhs_stride_z;
500 rhs_offset_first_element_in_bytes += x *
sizeof(DATA_TYPE) + z * rhs_stride_z;
501 dst_offset_first_element_in_bytes += x *
sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
// M0xN0 int accumulator, seeded with the constant zero-point cross term.
504 TILE(
int, M0, N0, acc);
507 acc[i].v =
K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
// Running operand sums over K for the offset-correction step.
510 TILE(
int, 1, N0, b_sum);
513 TILE(
int, 1, M0, a_sum);
// Main K loop: K0 reduction elements per iteration.  The operand loads
// for this transposed layout are in lines not visible here.
517 for(k = 0; k <=
K - K0; k += K0)
519 TILE(DATA_TYPE, M0, K0, a);
520 TILE(DATA_TYPE, N0, K0,
b);
537 T_MMUL(DATA_TYPE, DATA_TYPE,
int, M0, N0, K0, NT, T, a,
b, acc);
// Accumulate operand sums for the zero-point correction.
543 a_sum[0].s[j] += (int)a[j].s[i];
551 b_sum[0].s[j] += (int)
b[j].s[i];
// Advance both operands along K by their row strides.
555 lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
556 rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
// Leftover loop, one K element per step, for K not a multiple of K0.
563 TILE(DATA_TYPE, M0, 1, a);
564 TILE(DATA_TYPE, N0, 1,
b);
581 T_MMUL(DATA_TYPE, DATA_TYPE,
int, M0, N0, 1, NT, T, a,
b, acc);
587 a_sum[0].s[j] += (int)a[j].s[i];
595 b_sum[0].s[j] += (int)
b[j].s[i];
599 lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
600 rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
602 #endif // ((K % K0) != 0)
// Zero-point correction: remove RHS_OFFSET * (LHS row sum) and
// LHS_OFFSET * (RHS column sum) from each accumulator element.
608 acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((
int)(LHS_OFFSET)) * b_sum[0].s[j];
// Optional bias addition (helper defined at the top of the file).
616 perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
617 #endif // defined(BIAS)
// Output tile in DATA_TYPE; requantization step is in lines not visible
// here -- confirm against the full source.
620 TILE(DATA_TYPE, M0, N0, accq);
// Per-row indirect offsets for the boundary-aware store.
625 TILE(
int, M0, 1, indirect_buffer);
// Store accq, honouring a partial N0 at the right edge (x_cond).
631 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0,
PARTIAL_STORE_N0, BUFFER,
dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
633 #endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
635 #if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
// Quantized matrix multiply: dst = LHS^T * RHS^T -- both operands
// transposed.  Same zero-point scheme as the other variants in this file.
// NOTE(review): fragmentary view -- original lines are missing between
// the numbered fragments (including the LHS loads); comments describe
// only what is visible.
682 __kernel
void mat_mul_native_quantized_t_t(
// LHS transposed: output row y selects an LHS column (contiguous).
// RHS transposed: output column x selects an RHS row (stride_y).
695 lhs_offset_first_element_in_bytes += y *
sizeof(DATA_TYPE) + z * lhs_stride_z;
696 rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
697 dst_offset_first_element_in_bytes += x *
sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
// M0xN0 int accumulator, seeded with the constant zero-point cross term.
700 TILE(
int, M0, N0, acc);
703 acc[i].v =
K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
// Running operand sums over K for the offset-correction step.
706 TILE(
int, 1, M0, a_sum);
709 TILE(
int, 1, N0, b_sum);
// Main K loop: K0 reduction elements per iteration.  The LHS load for
// this transposed layout is in lines not visible here.
713 for(k = 0; k <=
K - K0; k += K0)
715 TILE(DATA_TYPE, M0, K0, a);
716 TILE(DATA_TYPE, N0, K0,
b);
// RHS rows (transposed columns) are contiguous: plain row-major load.
733 T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y,
b);
735 T_MMUL(DATA_TYPE, DATA_TYPE,
int, M0, N0, K0, NT, T, a,
b, acc);
// Accumulate operand sums for the zero-point correction.
741 a_sum[0].s[j] += (int)a[j].s[i];
749 b_sum[0].s[i] += (int)
b[i].s[j];
// Advance LHS along K by row stride, RHS along K contiguously.
753 lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
754 rhs_offset_first_element_in_bytes += K0 *
sizeof(DATA_TYPE);
// Leftover loop, one K element per step, for K not a multiple of K0.
761 TILE(DATA_TYPE, M0, 1, a);
762 TILE(DATA_TYPE, N0, 1,
b);
779 T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y,
b);
781 T_MMUL(DATA_TYPE, DATA_TYPE,
int, M0, N0, 1, NT, T, a,
b, acc);
787 a_sum[0].s[j] += (int)a[j].s[i];
795 b_sum[0].s[i] += (int)
b[i].s[j];
799 lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
800 rhs_offset_first_element_in_bytes += 1 *
sizeof(DATA_TYPE);
802 #endif // ((K % K0) != 0)
// Zero-point correction: remove RHS_OFFSET * (LHS row sum) and
// LHS_OFFSET * (RHS column sum) from each accumulator element.
808 acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
// Optional bias addition (helper defined at the top of the file).
816 perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
817 #endif // defined(BIAS)
// Output tile in DATA_TYPE; requantization step is in lines not visible
// here -- confirm against the full source.
820 TILE(DATA_TYPE, M0, N0, accq);
// Per-row indirect offsets for the boundary-aware store.
825 TILE(
int, M0, 1, indirect_buffer);
// Store accq, honouring a partial N0 at the right edge (x_cond).
831 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0,
PARTIAL_STORE_N0, BUFFER,
dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
833 #endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_T)