50 std::memset(&out[0], 0, out.num_elements() *
sizeof(T));
52 const unsigned int K = in.
shape()[0];
53 const unsigned int M = in.
shape()[1];
54 const unsigned int B = in.
shape()[2];
56 const unsigned int num_tiles_x = std::ceil(
K /
static_cast<float>(lhs_info.
k0));
57 const unsigned int num_tiles_y = std::ceil(
M /
static_cast<float>(lhs_info.
m0));
71 const unsigned int offset_output_x = lhs_info.
interleave ? tile_to_use->
shape()[0] : tile_to_use->
shape()[0] * tile_to_use->
shape()[1];
72 const unsigned int step_output_x = lhs_info.
interleave ? tile_to_use->
shape()[0] * lhs_info.
v0 : tile_to_use->
shape()[0];
74 for(
unsigned int z = 0; z <
B; ++z)
76 for(
unsigned int y = 0; y < num_tiles_y; ++y)
78 for(
unsigned int x = 0; x < num_tiles_x; ++x)
81 get_tile<T>(in, src_tile,
Coordinates(x * lhs_info.
k0, y * lhs_info.
m0, z, 0));
86 transpose_matrix<T>(src_tile, src_tile_transposed);
90 const unsigned int offset_output = (x * lhs_info.
k0 * lhs_info.
m0 * lhs_info.
v0) + ((y % lhs_info.
v0) * offset_output_x) + ((y / lhs_info.
v0) * out.shape()[0]) + (z * out.shape()[0] * out.shape()[1]);
92 for(
unsigned int i = 0; i < tile_to_use->
shape()[1]; ++i)
94 const unsigned int offset_tile = i * tile_to_use->
shape()[0];
97 std::copy(&(*tile_to_use)[offset_tile], &(*tile_to_use)[offset_tile + tile_to_use->
shape()[0]], &out[offset_output + i * step_output_x]);