// NOTE(review): this chunk is a garbled extraction of an OpenCL (.cl) GEMM reshape
// kernel file (Arm Compute Library style). The leading integers on many lines are
// original-source line numbers fused in by the extraction, and several spans of the
// original (kernel doc comment, full parameter list, loop headers/braces, the matching
// "#if PARTIAL_M0 != 0" for the "#endif" below, companion #undefs) are missing here.
// Code is left byte-identical; comments annotate only what is visible.

// Kernel: reshape the LHS matrix WITHOUT transposition into M0xK0 blocks, with V0
// vertical blocks interleaved per dst row when INTERLEAVE is defined.
29 #if defined(RESHAPE_LHS_NT)
62 __kernel
void gemm_reshape_lhs_matrix_nt(
TENSOR3D_T(
src, BUFFER),
// Elements per reshaped block.
68 #define BLOCK_SIZE ((M0) * (K0))
// x-offset (in elements) between consecutive interleaved slots in dst:
// one row of a block (K0) when interleaving, else a whole block.
71 #if defined(INTERLEAVE)
72 #define OUTPUT_OFFSET_X (K0)
73 #else // defined(INTERLEAVE)
74 #define OUTPUT_OFFSET_X (BLOCK_SIZE)
75 #endif // defined(INTERLEAVE)
// x-stride (in elements) between consecutive tile rows in dst,
// passed to the store macro below.
78 #if defined(INTERLEAVE)
79 #define OUTPUT_STEP_X (K0) * (V0)
80 #else // Do not interleave
81 #define OUTPUT_STEP_X (K0)
82 #endif // defined(INTERLEAVE)
// Input coordinates in elements. x/y/z presumably come from get_global_id()
// in lines dropped by the extraction — TODO confirm.
88 const int xi = x * K0;
89 const int yi = y * M0;
// Output coordinates: x-block position folds the (y % V0) interleave slot in;
// yo is the interleave group row.
91 const int xo = x *
BLOCK_SIZE * V0 + (y % V0) * OUTPUT_OFFSET_X;
92 const int yo = (y / V0);
// Advance base offsets. NOTE(review): the z (batch) term for src uses
// M * src_stride_y, i.e. batches appear stacked along y — verify against caller.
95 src_offset_first_element_in_bytes += yi * src_stride_y + z *
M * src_stride_y;
96 dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
// Tile holding the M0xK0 input block.
98 TILE(DATA_TYPE, M0, K0, in);
// Right-edge / bottom-edge partial-tile conditions.
106 bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
107 bool y_cond = (yi + M0 >=
M) && (PARTIAL_M0 != 0);
// Per-row source y indices for the indirect load (the enclosing
// loop header for _i is missing from this chunk).
109 TILE(uint, M0, 1, in_indirect_y);
112 in_indirect_y[_i].v = _i;
// Partial-height load path (loads only PARTIAL_M0 rows); the matching
// "#if" for the "#endif // PARTIAL_M0 != 0" below was dropped by extraction.
118 T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER,
src, xi, src_stride_y, x_cond, in, in_indirect_y);
121 #endif // PARTIAL_M0 != 0
// Full-height load path.
123 T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER,
src, xi, src_stride_y, x_cond, in, in_indirect_y);
// Per-row destination y indices for the indirect store.
127 TILE(uint, M0, 1, dst_indirect_y);
130 dst_indirect_y[_i].v = _i;
// Store the tile at xo, stepping OUTPUT_STEP_X elements between rows.
133 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, 0, BUFFER,
dst, xo, (OUTPUT_STEP_X *
sizeof(DATA_TYPE)),
false, in, dst_indirect_y);
// Macro cleanup (companion #undefs for BLOCK_SIZE / OUTPUT_STEP_X were
// presumably dropped by the extraction — TODO confirm against original).
135 #undef OUTPUT_OFFSET_X
138 #endif // defined(RESHAPE_LHS_NT)
// NOTE(review): garbled extraction of an OpenCL GEMM reshape kernel — leading integers
// are fused original line numbers; parameter lists, loop headers, braces, the matching
// "#if PARTIAL_M0 != 0", and companion #undefs are missing. Code left byte-identical.

// Kernel: reshape the LHS matrix WITH transposition — loads an M0xK0 tile,
// transposes it to K0xM0, then stores, with V0-way interleaving when
// INTERLEAVE is defined.
140 #if defined(RESHAPE_LHS_T)
173 __kernel
void gemm_reshape_lhs_matrix_t(
TENSOR3D_T(
src, BUFFER),
// Elements per reshaped block.
179 #define BLOCK_SIZE ((M0) * (K0))
// x-offset between interleaved slots in dst: one transposed row (M0)
// when interleaving, else a whole block.
182 #if defined(INTERLEAVE)
183 #define OUTPUT_OFFSET_X (M0)
184 #else // defined(INTERLEAVE)
185 #define OUTPUT_OFFSET_X (BLOCK_SIZE)
186 #endif // defined(INTERLEAVE)
// x-stride between consecutive stored tile rows.
189 #if defined(INTERLEAVE)
190 #define OUTPUT_STEP_X (M0) * (V0)
191 #else // Do not interleave
192 #define OUTPUT_STEP_X (M0)
193 #endif // defined(INTERLEAVE)
// Input coordinates in elements; x/y/z presumably from get_global_id()
// (dropped by extraction) — TODO confirm.
199 const int xi = x * K0;
200 const int yi = y * M0;
// Output block coordinates with the (y % V0) interleave slot folded into xo.
202 const int xo = x *
BLOCK_SIZE * V0 + ((y % V0) * OUTPUT_OFFSET_X);
203 const int yo = (y / V0);
// NOTE(review): src batch term uses M * src_stride_y (batches stacked along y?)
// — verify against caller.
206 src_offset_first_element_in_bytes += yi * src_stride_y + z *
M * src_stride_y;
207 dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
// Input tile and its transpose.
209 TILE(DATA_TYPE, M0, K0, in);
210 TILE(DATA_TYPE, K0, M0, in_tr);
// Partial-tile edge conditions.
219 bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
220 bool y_cond = (yi + M0 >=
M) && (PARTIAL_M0 != 0);
// Per-row source y indices (enclosing _i loop header missing from this chunk).
222 TILE(uint, M0, 1, in_indirect_y);
225 in_indirect_y[_i].v = _i;
// Partial-height load; the matching "#if" for the "#endif" below was dropped.
231 T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER,
src, xi, src_stride_y, x_cond, in, in_indirect_y);
234 #endif // PARTIAL_M0 != 0
// Full-height load.
236 T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER,
src, xi, src_stride_y, x_cond, in, in_indirect_y);
// Transpose element-wise (the surrounding k0/m0 loop headers are missing here).
243 in_tr[k0].s[m0] = in[m0].s[k0];
// Destination row indices for the transposed (K0-row) store.
247 TILE(uint, K0, 1, dst_indirect_y);
250 dst_indirect_y[_i].v = _i;
// Store the transposed K0xM0 tile.
254 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, M0, 0, BUFFER,
dst, xo, (OUTPUT_STEP_X *
sizeof(DATA_TYPE)),
false, in_tr, dst_indirect_y);
// Macro cleanup (other #undefs presumably dropped by extraction).
257 #undef OUTPUT_OFFSET_X
260 #endif // defined(RESHAPE_LHS_T)
// NOTE(review): garbled extraction of an OpenCL GEMM reshape kernel — leading integers
// are fused original line numbers; the kernel parameter list, loop bodies/braces and
// companion #undefs are missing. Code left byte-identical; comments only.

// Kernel: reshape the RHS matrix WITHOUT transposition into K0xN0 blocks,
// H0 horizontal blocks interleaved per dst row when INTERLEAVE is defined.
// Note the x/y role swap versus the LHS kernels: dst block column comes from y,
// dst row group from x.
262 #if defined(RESHAPE_RHS_NT)
290 __kernel
void gemm_reshape_rhs_matrix_nt(
TENSOR3D_T(
src, BUFFER),
// Elements per reshaped block.
295 #define BLOCK_SIZE ((K0) * (N0))
// x-offset between interleaved slots in dst: one row of a block (N0)
// when interleaving, else a whole block.
298 #if defined(INTERLEAVE)
299 #define OUTPUT_OFFSET_X (N0)
300 #else // defined(INTERLEAVE)
301 #define OUTPUT_OFFSET_X (BLOCK_SIZE)
302 #endif // defined(INTERLEAVE)
// x-stride between consecutive stored tile rows.
305 #if defined(INTERLEAVE)
306 #define OUTPUT_STEP_X (N0) * (H0)
307 #else // Do not interleave
308 #define OUTPUT_STEP_X (N0)
309 #endif // defined(INTERLEAVE)
// Input coordinates in elements; x/y/z presumably from get_global_id()
// (dropped by extraction) — TODO confirm.
315 const int xi = x * N0;
316 const int yi = y * K0;
// Output block coordinates: xo folds the (x % H0) interleave slot in.
318 const int xo = y *
BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
319 const int yo = (x / H0);
// Advance base offsets for batch z (RHS uses plain stride_z, unlike the LHS kernels).
321 src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
322 dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
// Tile holding the K0xN0 input block.
324 TILE(DATA_TYPE, K0, N0, in);
// First i-loop: body missing from this chunk (original lines 328-331 dropped);
// presumably zero-initializes the tile — TODO confirm.
327 for(
int i = 0; i < K0; ++i)
// Second i-loop: load one N0-wide vector per tile row from src.
333 for(
int i = 0; i < K0; ++i)
337 in[i].v =
V_LOAD(DATA_TYPE, N0, BUFFER,
src, xi, i, src_stride_y);
// Destination row indices for the store.
341 TILE(uint, K0, 1, dst_indirect_y);
342 for(
int i = 0; i < K0; ++i)
344 dst_indirect_y[i].v = i;
// Store the tile at xo, stepping OUTPUT_STEP_X elements between rows.
347 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER,
dst, xo, (OUTPUT_STEP_X *
sizeof(DATA_TYPE)),
false, in, dst_indirect_y);
// Macro cleanup (other #undefs presumably dropped by extraction).
350 #undef OUTPUT_OFFSET_X
353 #endif // defined(RESHAPE_RHS_NT)
// NOTE(review): garbled extraction of an OpenCL GEMM reshape kernel — leading integers
// are fused original line numbers; the kernel parameter list, some loop bodies/braces
// and companion #undefs are missing. Code left byte-identical; comments only.

// Kernel: reshape the RHS matrix WITH transposition — loads a K0xN0 tile,
// transposes it to N0xK0, then stores, with H0-way interleaving when
// INTERLEAVE is defined.
355 #if defined(RESHAPE_RHS_T)
384 __kernel
void gemm_reshape_rhs_matrix_t(
TENSOR3D_T(
src, BUFFER),
// Elements per reshaped block.
389 #define BLOCK_SIZE ((K0) * (N0))
// x-offset between interleaved slots in dst: one transposed row (K0)
// when interleaving, else a whole block.
392 #if defined(INTERLEAVE)
393 #define OUTPUT_OFFSET_X (K0)
394 #else // defined(INTERLEAVE)
395 #define OUTPUT_OFFSET_X (BLOCK_SIZE)
396 #endif // defined(INTERLEAVE)
// x-stride between consecutive stored tile rows.
399 #if defined(INTERLEAVE)
400 #define OUTPUT_STEP_X (K0) * (H0)
401 #else // Do not interleave
402 #define OUTPUT_STEP_X (K0)
403 #endif // defined(INTERLEAVE)
// Input coordinates in elements; x/y/z presumably from get_global_id()
// (dropped by extraction) — TODO confirm.
409 const int xi = x * N0;
410 const int yi = y * K0;
// Output block coordinates: xo folds the (x % H0) interleave slot in.
412 const int xo = y *
BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
413 const int yo = (x / H0);
// Advance base offsets for batch z.
415 src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
416 dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
// Input tile and its transpose.
418 TILE(DATA_TYPE, K0, N0, in);
419 TILE(DATA_TYPE, N0, K0, in_tr);
// First i-loop: body missing from this chunk (original lines 423-426 dropped);
// presumably zero-initializes the tile — TODO confirm.
422 for(
int i = 0; i < K0; ++i)
// Second i-loop: load one N0-wide vector per tile row from src.
428 for(
int i = 0; i < K0; ++i)
432 in[i].v =
V_LOAD(DATA_TYPE, N0, BUFFER,
src, xi, i, src_stride_y);
// Transpose element-wise into in_tr.
437 for(
int k0 = 0; k0 < K0; ++k0)
439 for(
int n0 = 0; n0 < N0; ++n0)
441 in_tr[n0].s[k0] = in[k0].s[n0];
// Destination row indices for the transposed (N0-row) store.
445 TILE(uint, N0, 1, dst_indirect_y);
446 for(
int i = 0; i < N0; ++i)
448 dst_indirect_y[i].v = i;
// Store the transposed N0xK0 tile.
451 T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER,
dst, xo, (OUTPUT_STEP_X *
sizeof(DATA_TYPE)),
false, in_tr, dst_indirect_y);
// Macro cleanup (other #undefs presumably dropped by extraction).
454 #undef OUTPUT_OFFSET_X
458 #endif // defined(RESHAPE_RHS_T)