// Adds the bias row to the accumulator tile in place; compiled in only when BIAS is defined.
inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
    TILE(DATA_TYPE, 1, N0, bias_tile);
    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
#endif // defined(BIAS)
#if defined(MAT_MUL_NATIVE_NT_NT)
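// Kernel: mat_mul_native_nt_nt
// Computes dst = LHS * RHS (+ optional bias, + activation) with both LHS and RHS stored non-transposed.
// M0 (rows), N0 (columns) and K0 (accumulation depth) are the tile sizes and are compile-time macros.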
__kernel void mat_mul_native_nt_nt(
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
    TILE(DATA_TYPE, M0, N0, acc);

    const int rhs_z = z * rhs_h;
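    // Main loop: accumulate the M0 x N0 output tile over the K dimension in steps of K0,
    // loading an M0 x K0 LHS tile and a K0 x N0 RHS tile per iteration.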
    for(k = 0; k <= K - K0; k += K0)
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, K0, N0, b);
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, b, acc);
        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
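    // Leftover loop: when K is not a multiple of K0, the remaining iterations are processed
    // one K element at a time (see the matching #endif below).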
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, 1, N0, b);
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, b, acc);
        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
#endif // K % K0 != 0
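    // Post-processing: set up the row-index buffer used by the indirect store, apply the
    // optional bias and the activation, then store the tile with partial-width handling along N.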
    TILE(int, M0, 1, indirect_buffer);

    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);

#endif // defined(MAT_MUL_NATIVE_NT_NT)
#if defined(MAT_MUL_NATIVE_NT_T)
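// Kernel: mat_mul_native_nt_t
// Computes dst = LHS * RHS^T (+ optional bias, + activation): LHS is non-transposed, RHS is stored transposed.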
__kernel void mat_mul_native_nt_t(
    TENSOR3D_T(lhs, BUFFER),
    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
    TILE(DATA_TYPE, M0, N0, acc);

    const int rhs_z = z * rhs_h;
    for(k = 0; k <= K - K0; k += K0)
        TILE(DATA_TYPE, M0, K0, a);
        TILE(DATA_TYPE, N0, K0, b);
        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
#if GPU_ARCH == GPU_ARCH_MIDGARD
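        // On Midgard GPUs the RHS tile is transposed manually into bt so that the
        // non-transposed (NT/NT) T_MMUL variant can be used; other architectures call
        // the NT/T variant on b directly.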
        TILE(DATA_TYPE, K0, N0, bt);

        bt[j].s[i] = b[i].s[j];

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, bt, acc);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
        TILE(DATA_TYPE, M0, 1, a);
        TILE(DATA_TYPE, N0, 1, b);
        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
#if GPU_ARCH == GPU_ARCH_MIDGARD
        TILE(DATA_TYPE, 1, N0, bt);

        bt[0].s[i] = b[i].s[0];

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, bt, acc);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
#endif // K % K0 != 0

    TILE(int, M0, 1, indirect_buffer);

    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);

#endif // defined(MAT_MUL_NATIVE_NT_T)
#if defined(MAT_MUL_NATIVE_T_NT)
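// Kernel: mat_mul_native_t_nt
// Computes dst = LHS^T * RHS (+ optional bias, + activation): LHS is stored transposed, RHS is non-transposed.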
__kernel void mat_mul_native_t_nt(
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
    TILE(DATA_TYPE, M0, N0, acc);

    const int rhs_z = z * rhs_h;
    for(k = 0; k <= K - K0; k += K0)
        TILE(DATA_TYPE, K0, M0, a);
        TILE(DATA_TYPE, K0, N0, b);
        T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
#if GPU_ARCH == GPU_ARCH_MIDGARD
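        // On Midgard the LHS tile is transposed manually into at and the NT/NT T_MMUL
        // variant is used; other architectures call the T/NT variant on a directly.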
        TILE(DATA_TYPE, M0, K0, at);

        at[j].s[i] = a[i].s[j];

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, b, acc);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, NT, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        TILE(DATA_TYPE, 1, M0, a);
        TILE(DATA_TYPE, 1, N0, b);
        T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
#if GPU_ARCH == GPU_ARCH_MIDGARD
        TILE(DATA_TYPE, M0, 1, at);

        at[j].s[0] = a[0].s[j];

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, b, acc);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, NT, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
#endif // K % K0 != 0

    TILE(int, M0, 1, indirect_buffer);

    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);

#endif // defined(MAT_MUL_NATIVE_T_NT)
#if defined(MAT_MUL_NATIVE_T_T)
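// Kernel: mat_mul_native_t_t
// Computes dst = LHS^T * RHS^T (+ optional bias, + activation): both LHS and RHS are stored transposed.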
__kernel void mat_mul_native_t_t(
    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
    TILE(DATA_TYPE, M0, N0, acc);

    const int rhs_z = z * rhs_h;
    for(k = 0; k <= K - K0; k += K0)
        TILE(DATA_TYPE, K0, M0, a);
        TILE(DATA_TYPE, N0, K0, b);
        T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
#if GPU_ARCH == GPU_ARCH_MIDGARD
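        // On Midgard both the LHS and RHS tiles are transposed manually into at and bt
        // so the NT/NT T_MMUL variant can be used; other architectures call the T/T variant directly.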
        TILE(DATA_TYPE, M0, K0, at);
        TILE(DATA_TYPE, K0, N0, bt);

        at[j].s[i] = a[i].s[j];

        bt[j].s[i] = b[i].s[j];

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, bt, acc);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
        TILE(DATA_TYPE, 1, M0, a);
        TILE(DATA_TYPE, N0, 1, b);
        T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
#if GPU_ARCH == GPU_ARCH_MIDGARD
        TILE(DATA_TYPE, M0, 1, at);
        TILE(DATA_TYPE, 1, N0, bt);

        at[j].s[0] = a[0].s[j];

        bt[0].s[i] = b[i].s[0];

        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, bt, acc);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, T, a, b, acc);
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
#endif // K % K0 != 0

    TILE(int, M0, 1, indirect_buffer);

    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
#endif // defined(BIAS)

    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);

#endif // defined(MAT_MUL_NATIVE_T_T)