50 std::memset(&out[0], 0, out.num_elements() *
sizeof(T));
52 const unsigned int K = in.
shape()[0];
53 const unsigned int M = in.
shape()[1];
54 const unsigned int B = in.
shape()[2];
56 const unsigned int num_tiles_x = std::ceil(K / static_cast<float>(lhs_info.
k0));
57 const unsigned int num_tiles_y = std::ceil(M / static_cast<float>(lhs_info.
m0));
71 const unsigned int offset_output_x = lhs_info.
interleave ? tile_to_use->
shape()[0] : tile_to_use->
shape()[0] * tile_to_use->
shape()[1];
72 const unsigned int step_output_x = lhs_info.
interleave ? tile_to_use->
shape()[0] * lhs_info.
v0 : tile_to_use->
shape()[0];
74 for(
unsigned int z = 0; z < B; ++z)
76 for(
unsigned int y = 0; y < num_tiles_y; ++y)
78 for(
unsigned int x = 0; x < num_tiles_x; ++x)
81 get_tile<T>(in, src_tile,
Coordinates(x * lhs_info.
k0, y * lhs_info.
m0, z, 0));
86 transpose_matrix<T>(src_tile, src_tile_transposed);
90 const unsigned int offset_output = (x * lhs_info.
k0 * lhs_info.
m0 * lhs_info.
v0) + ((y % lhs_info.
v0) * offset_output_x) + ((y / lhs_info.
v0) * out.shape()[0]) + (z * out.shape()[0] * out.shape()[1]);
92 for(
unsigned int i = 0; i < tile_to_use->
shape()[1]; ++i)
94 const unsigned int offset_tile = i * tile_to_use->
shape()[0];
97 std::copy(&(*tile_to_use)[offset_tile], &(*tile_to_use)[offset_tile + tile_to_use->
shape()[0]], &out[offset_output + i * step_output_x]);
unsigned int v0
Number of vertical blocks of size (m0xk0) stored on the same output row.
DataType data_type() const override
Data type of the tensor.
SimpleTensor< T > gemm_reshape_lhs_matrix(const SimpleTensor< T > &in, const TensorShape &output_shape, const GEMMLHSMatrixInfo &lhs_info)
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
GEMM LHS (Left Hand Side) matrix information.
TensorShape shape() const override
Shape of the tensor.
SimpleTensor< T > copy(const SimpleTensor< T > &src, const TensorShape &output_shape)
bool interleave
True if the v0 (m0xk0) blocks have to be interleaved in the output row.
Copyright (c) 2017-2021 Arm Limited.
bool transpose
True if the (m0xk0) block has to be transposed before being stored.
Simple tensor object that stores elements in a consecutive chunk of memory.
unsigned int k0
Number of partial accumulations performed by the matrix multiplication.
unsigned int m0
Number of rows processed by the matrix multiplication.