// NOTE(review): the leading numerals on these lines (47, 50, 51, ...) look like
// original line numbers left over from an extraction; the lines between them
// (braces, the enclosing declaration, `out_index` handling, branch conditions)
// are not visible here. Comments below describe only what the visible lines show.
//
// Outer template parameter list. The declaration it belongs to is not visible;
// presumably it is `Transform_ref`, which is instantiated further down with
// arguments of matching kinds -- TODO confirm.
//   tIntBy     - number of rows grouped into one row block (copied into `IntBy`).
//   BlockBy    - number of columns grouped into one column block.
//   Transposed - not referenced in the visible lines; presumably selects which
//                of the two input indexing forms below is used -- TODO confirm.
//   TOutSize, TInSize, vlt - not referenced in the visible lines.
//   d_type     - type of the zero value written as padding.
47 template <
unsigned int tIntBy,
unsigned int BlockBy,
bool Transposed,
size_t TOutSize,
size_t TInSize,
typename d_type, arm_gemm::VLType vlt>
// Inner template parameters: the types of the output and input containers,
// both of which are accessed with operator[] below.
50 template <
typename TOut,
typename TIn>
// Copies the region rows [y0, ymax) x columns [x0, xmax) of `in` into `out`,
// walking it in blocks of IntBy rows by BlockBy columns.
//   out      - destination, written through out[out_index].
//   in       - source, read through a flat index built from `stride`.
//   stride   - multiplier applied to the major coordinate when indexing `in`.
//   y0, ymax - half-open row range to process.
//   x0, xmax - half-open column range to process.
51 static void Transform(TOut &out,
const TIn in,
const int stride,
52 const int y0,
const int ymax,
const int x0,
const int xmax)
// Local copy of the tIntBy template parameter.
58 const unsigned int IntBy = tIntBy;
// Row blocking: count of full IntBy-row blocks, the leftover rows, and the
// total block count including a trailing partial block if there is one.
61 const int n_whole_y_blocks = (ymax - y0) / IntBy;
62 const int y_remainders = (ymax - y0) % IntBy;
63 const int n_y_blocks = n_whole_y_blocks + (y_remainders ? 1 : 0);
// Column blocking: same computation using BlockBy.
65 const int n_whole_x_blocks = (xmax - x0) / BlockBy;
66 const int x_remainders = (xmax - x0) % BlockBy;
67 const int n_x_blocks = n_whole_x_blocks + (x_remainders ? 1 : 0);
// Iterate over row blocks.
72 for(
int y_block = 0; y_block < n_y_blocks; y_block++)
// Rows of this block that hold real data; a full block has IntBy, the trailing
// partial block has y_remainders. blank_rows is the shortfall up to IntBy.
74 const int fill_rows = (y_block < n_whole_y_blocks) ? IntBy : y_remainders;
75 const int blank_rows = IntBy - fill_rows;
// First source row covered by this row block.
77 const int y_base = y0 + (y_block * IntBy);
// Iterate over column blocks.
80 for(
int x_block = 0; x_block < n_x_blocks; x_block++)
// Columns of this block that hold real data, and the shortfall up to BlockBy.
82 const int fill_cols = (x_block < n_whole_x_blocks) ? BlockBy : x_remainders;
83 const int blank_cols = BlockBy - fill_cols;
// First source column covered by this column block.
85 const int x_base = x0 + (x_block * BlockBy);
// Copy the populated part of the block, row by row, column by column.
87 for(
int row = 0; row < fill_rows; row++)
89 for(
int col = 0; col < fill_cols; col++)
// Two alternative reads of `in`: the first scales the column coordinate by
// `stride`, the second scales the row coordinate. The condition choosing
// between them is not visible; presumably it is `Transposed` -- TODO confirm.
// The declaration and advancement of `out_index` are also not visible.
94 out[out_index] = in[(x_base + col) * stride + y_base + row];
99 out[out_index] = in[(y_base + row) * stride + x_base + col];
// Loop over the missing columns of a partial column block; its body is not
// visible here (presumably writes padding -- TODO confirm).
104 for(
int col = 0; col < blank_cols; col++)
// Padding for the missing rows of a partial row block: since
// fill_cols + blank_cols equals BlockBy, `pads` is blank_rows * BlockBy
// elements, each set to a zero of type d_type.
111 const d_type zeroval = 0;
112 const int pads = blank_rows * (fill_cols + blank_cols);
114 for(
int i = 0; i < pads; i++)
116 out[out_index] = zeroval;
// NOTE(review): the signature of this function template (original lines
// 126-128) is not visible, nor are the declarations of `src` and `dst` or the
// condition that chooses between the two calls below. The leading numerals
// look like original line numbers left over from an extraction.
125 template <
typename T>
// Extents read from the source: index 0 of shape() is taken as the column
// count and index 1 as the row count.
129 const int cols =
src.shape()[0];
130 const int rows =
src.shape()[1];
// Call with tIntBy = 4, BlockBy = 1, Transposed = true.
// Arguments map to Transform's parameters as: out = dst, in = src,
// stride = rows, y0 = 0, ymax = rows, x0 = 0, xmax = cols.
// NOTE(review): the size and d_type arguments are fixed to float even though
// this function is templated on T -- confirm this is intended.
136 Transform_ref<4, 1,
true,
sizeof(float),
sizeof(
float), float,
arm_gemm::VLType::None>::Transform<SimpleTensor<T> &,
SimpleTensor<T>>(
dst,
src,
rows, 0,
rows, 0,
cols);
// Same call with tIntBy = 8; all other template and runtime arguments are
// identical to the call above.
141 Transform_ref<8, 1,
true,
sizeof(float),
sizeof(
float), float,
arm_gemm::VLType::None>::Transform<SimpleTensor<T> &,
SimpleTensor<T>>(
dst,
src,
rows, 0,
rows, 0,
cols);