// interleave_block: copies elements from a set of input rows into the output
// buffer, block columns at a time per row, and can append per-row int32_t sums.
//
// Template parameters:
//   height_vectors, block, vlt - used to derive int_by, the number of rows
//                                emitted per column step (see below).
//   integrate_sums             - selects whether row sums are written out.
//   TIn / TOut                 - input and output element types.
// Parameters:
//   out        - output pointer, taken by reference and advanced by this function.
//   in         - array of row pointers, indexed as in[row][row_offset + pos + col].
//   width      - number of columns to process.
//   height     - not referenced in the visible lines; presumably the number of
//                valid rows - TODO confirm.
//   row_offset - column offset added to every read from a row.
//   first      - not referenced in the visible lines; presumably marks the first
//                pass over an output block - TODO confirm.
//
// NOTE(review): the leading numbers look like line numbers from the original
// listing and they skip (45, 47, 50, 62 and others are absent), so preprocessor
// branches, closing braces and some statements are not visible here. The
// comments below describe only the lines that are present.
40 template<
unsigned int height_vectors,
unsigned int block, VLType vlt,
bool integrate_sums,
typename TIn,
typename TOut>
41 void interleave_block( TOut * &out,
const TIn *
const *in,
size_t width,
size_t height,
size_t row_offset,
bool first) {
42 #ifdef ARM_COMPUTE_ENABLE_SVE
// With ARM_COMPUTE_ENABLE_SVE defined: int_by is height_vectors multiplied by
// get_vector_length<TOut>() / block for VLType::SVE, by
// sme::get_vector_length<TOut>() / block for VLType::SME, and by 1 otherwise.
43 const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
44 (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
// Alternative definition; presumably the #else branch for builds without SVE
// (the directive itself is not shown).
46 const unsigned int int_by = height_vectors;
// Per-row accumulators for the sums.
49 std::vector<int32_t> the_sums;
// int_by accumulators, all starting at zero. Presumably only done when
// integrate_sums is set - the guarding condition is not visible.
52 the_sums = std::vector<int32_t>(int_by, 0);
// View the output pointer as int32_t, copy int_by values from there into
// the_sums, then point out at that same location.
// NOTE(review): presumably this reloads sums written by an earlier call and is
// guarded by the first parameter, with a pointer rewind in between; neither the
// guard nor any pointer adjustment is visible - confirm against the full file.
61 int32_t *out_int32 =
reinterpret_cast<int32_t *
>(out);
67 memcpy(the_sums.data(), out_int32, int_by *
sizeof(int32_t));
70 out =
reinterpret_cast<TOut *
>(out_int32);
// Walk across the width in steps of block columns.
74 for (
unsigned int pos=0; pos<width; pos+=block) {
// For each column step, visit int_by output rows.
75 for (
unsigned int row=0; row<int_by; row++) {
// First inner column loop; its body and guard are not visible. Presumably it
// pads rows at or beyond height - TODO confirm.
78 for (
unsigned int col=0; col<block; col++) {
// Second inner column loop: handles one block of columns of the current row.
84 for (
unsigned int col=0; col<block; col++) {
// Column index is past the end of the row; the handling for this case is not
// visible in this listing.
86 if (pos + col >= width) {
// Add the element to the running sum of this row. Presumably guarded by
// integrate_sums - the guard is not visible.
92 the_sums[row] += in[row][row_offset + pos + col];
// Store the element and advance the output pointer by one TOut.
95 *out++ = in[row][row_offset + pos + col];
// When sums are integrated, write the int_by accumulated sums as int32_t
// values at the current output position and move out past them.
100 if (integrate_sums) {
101 int32_t *out_int32 =
reinterpret_cast<int32_t *
>(out);
103 memcpy(out_int32, the_sums.data(), int_by *
sizeof(int32_t));
105 out =
reinterpret_cast<TOut *
>(out_int32 + int_by);
// FixupRowSums: post-processes a run of int32_t values located via the output
// pointer, using row_sum_multiplier.
//
// Parameters:
//   out                - output pointer, taken by reference; reassigned on the
//                        second path below.
//   row_sum_multiplier - factor applied to each value when it is non-zero.
//
// NOTE(review): listing numbers skip here as well (for example 124-125, 128-131
// and 135-139 are absent), so the else line, closing braces, any pointer
// adjustments and the body of the second loop are not visible.
109 template<
unsigned int height_vectors,
unsigned int block, VLType vlt,
typename TOut>
110 inline void FixupRowSums(TOut * &out,
const int32_t row_sum_multiplier) {
111 #ifdef ARM_COMPUTE_ENABLE_SVE
// Number of int32_t entries to touch: height_vectors scaled by
// get_vector_length<TOut>() / block (SVE), sme::get_vector_length<TOut>() / block
// (SME), or 1 for any other vlt.
112 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
113 (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
// Alternative definition; presumably the #else branch (directive not shown).
115 const unsigned int height = height_vectors;
// Non-zero multiplier: scale each of the height entries in place.
119 if (row_sum_multiplier) {
// NOTE(review): whether out_int32 is moved before the loop is not visible here.
123 int32_t *out_int32 =
reinterpret_cast<int32_t *
>(out);
126 for (
unsigned int i=0; i<height; i++) {
127 out_int32[i] *= row_sum_multiplier;
// Second path; presumably the else branch taken for a zero multiplier (the else
// line is not shown). It loops over height entries - the loop body is not
// visible - and then reassigns out from out_int32.
132 int32_t *out_int32 =
reinterpret_cast<int32_t *
>(out);
134 for (
unsigned int i=0; i<height; i++) {
140 out =
reinterpret_cast<TOut *
>(out_int32);
// Interleave driver that reads its rows through a table of row pointers (ptr)
// organised in strings of length stringlen, padded to rounded_stringlen.
//
// NOTE(review): the line carrying the function name and its leading parameters
// (listing line 145) is missing. The names out, ptr, stringlen and first used
// below are declared on lines that are not visible, so their exact types cannot
// be confirmed from this listing. Closing braces, else lines and the loop that
// advances through the strings are also absent.
144 template<
unsigned int height_vectors,
unsigned int block, VLType vlt,
typename TIn,
typename TOut>
146 unsigned int rounded_stringlen,
const unsigned int y0,
const unsigned int ymax,
147 const unsigned int k0,
const unsigned int kmax,
bool integrate_sums,
148 const int32_t row_sum_multiplier) {
149 #ifdef ARM_COMPUTE_ENABLE_SVE
// Rows handled per outer step: height_vectors scaled by
// get_vector_length<TOut>() / block (SVE), sme::get_vector_length<TOut>() / block
// (SME), or 1 for any other vlt.
150 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
151 (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
// Alternative definition; presumably the #else branch (directive not shown).
153 const unsigned int height = height_vectors;
// Scratch array of height row pointers obtained with alloca.
164 const TIn **row_ptrs =
reinterpret_cast<const TIn **
>(alloca(height *
sizeof(
const TIn *)));
// Split k0 into the index of the starting string and the offset inside it,
// both measured in units of rounded_stringlen.
167 unsigned int start_string = k0 / rounded_stringlen;
168 unsigned int start_stringpos = k0 % rounded_stringlen;
// Process rows y0 up to ymax in steps of height.
171 for (
unsigned int ybase = y0; ybase < ymax; ybase+=height) {
// Rows actually available in this step, clipped at ymax.
173 unsigned int active_height = std::min(ymax - ybase, height);
// Per-step state: remaining K extent and current string and position, reset to
// the starting values for every row step.
176 unsigned int k_left = (kmax - k0);
177 unsigned int string = start_string;
178 unsigned int stringpos = start_stringpos;
// Both widths are capped by k_left; in_width is limited by what remains of
// stringlen, out_width by what remains of rounded_stringlen. The use of
// out_width is not visible in this listing.
185 unsigned int in_width = std::min(k_left, stringlen - stringpos);
186 unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);
// Row pointer table of the current string, offset to the first row of this step.
188 const TIn *
const *row_base = ptr[string] + ybase;
// Fewer than height rows remain: copy the active row pointers into row_ptrs.
// NOTE(review): how the remaining entries are filled and whether row_base is
// redirected to row_ptrs is not visible - confirm against the full file.
191 if (active_height < height) {
192 for (
unsigned int i=0; i<active_height; i++) {
193 row_ptrs[i] = ptr[string][ybase + i];
// Sums are integrated only when TOut is an integral type and both integrate_sums
// and row_sum_multiplier are set; the second call (presumably the else branch)
// interleaves without sums.
202 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
203 interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
205 interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
// Fix up the sums whenever TOut is integral and integrate_sums is requested.
// This condition does not test row_sum_multiplier, so FixupRowSums is also
// reached with a zero multiplier.
214 if (std::is_integral<TOut>::value && integrate_sums) {
215 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
// ConvolutionInterleave: interleaves input whose row pointers are produced by a
// convolver object, one row step at a time.
//
// Parameters:
//   out                - destination buffer (passed by value here, then by
//                        reference to interleave_block and FixupRowSums).
//   in, in_stride      - input base pointer and stride, forwarded to
//                        conv.process_columns.
//   conv               - convolver<TIn> that supplies the row pointers.
//   rounded_stringlen  - forwarded to conv.process_columns.
//   y0, ymax           - row range to process.
//   k0, kmax           - column range, forwarded to conv.process_columns.
//   integrate_sums, row_sum_multiplier - control row-sum generation and fixup.
//
// NOTE(review): listing numbers skip (for example 226, 228, 237-238, 250 and
// 252-256 are absent), so else lines, closing braces and the declaration of
// first are not visible.
220 template<
unsigned int height_vectors,
unsigned int block, VLType vlt,
typename TIn,
typename TOut>
221 void ConvolutionInterleave(TOut *out,
const TIn *in,
size_t in_stride,
const convolver<TIn> &conv,
const unsigned int rounded_stringlen,
222 const unsigned int y0,
const unsigned int ymax,
const unsigned int k0,
const unsigned int kmax,
bool integrate_sums,
const int32_t row_sum_multiplier) {
223 #ifdef ARM_COMPUTE_ENABLE_SVE
// Rows handled per outer step: height_vectors scaled by
// get_vector_length<TOut>() / block (SVE), sme::get_vector_length<TOut>() / block
// (SME), or 1 for any other vlt.
224 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
225 (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
// Alternative definition; presumably the #else branch (directive not shown).
227 const unsigned int height = height_vectors;
// Column-level view of the convolution for the range k0 to kmax.
229 auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
// Scratch array of height row pointers obtained with alloca; filled by
// conv_rows.next_block below.
232 const TIn **row_ptrs =
reinterpret_cast<const TIn **
>(alloca(height *
sizeof(
const TIn *)));
// Process rows y0 up to ymax in steps of height.
234 for (
unsigned int ybase = y0; ybase < ymax; ybase += height) {
// Rows actually available in this step, clipped at ymax.
236 unsigned int active_height = std::min(ymax - ybase, height);
// Row-level view for the active rows of this step.
239 auto conv_rows = conv_cols.process_rows(ybase, active_height);
// Consume blocks until the row view reports it is finished.
241 while (!conv_rows.finished()) {
242 unsigned int width,
offset;
// next_block receives row_ptrs and returns the width and offset of the block.
245 std::tie(width,
offset) = conv_rows.next_block(row_ptrs);
// Sums are integrated only when TOut is an integral type and both integrate_sums
// and row_sum_multiplier are set; the second call (presumably the else branch)
// interleaves without sums. The variable first is declared on a line that is
// not visible here.
248 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
249 interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height,
offset, first);
251 interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height,
offset, first);
// Fix up the sums whenever TOut is integral and integrate_sums is requested;
// row_sum_multiplier is not tested here, so a zero multiplier also reaches
// FixupRowSums.
257 if (std::is_integral<TOut>::value && integrate_sums) {
258 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
// Interleave: interleaves a strided input, building the row pointers directly
// from in and in_stride.
//
// Parameters:
//   out                - destination buffer (passed by value here, then by
//                        reference to interleave_block and FixupRowSums).
//   in, in_stride      - input base pointer and the distance between rows, in
//                        elements of TIn (used as in + row * in_stride).
//   y0, ymax           - row range to process.
//   k0, kmax           - column range; k0 is passed as the row offset and
//                        kmax - k0 as the width.
//   integrate_sums, row_sum_multiplier - control row-sum generation and fixup.
//
// NOTE(review): listing numbers skip (for example 268, 270-271, 279-280, 283 and
// 285-286 are absent), so else lines and closing braces are not visible.
263 template<
unsigned int height_vectors,
unsigned int block, VLType vlt,
typename TIn,
typename TOut>
264 void Interleave(TOut *out,
const TIn *in,
size_t in_stride,
const unsigned int y0,
const unsigned int ymax,
const unsigned int k0,
const unsigned int kmax,
bool integrate_sums,
const int32_t row_sum_multiplier) {
265 #ifdef ARM_COMPUTE_ENABLE_SVE
// Rows handled per outer step: height_vectors scaled by
// get_vector_length<TOut>() / block (SVE), sme::get_vector_length<TOut>() / block
// (SME), or 1 for any other vlt.
266 const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
267 (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
// Alternative definition; presumably the #else branch (directive not shown).
269 const unsigned int height = height_vectors;
// Scratch array of height row pointers obtained with alloca.
272 const TIn **row_ptrs =
reinterpret_cast<const TIn **
>(alloca(height *
sizeof(
const TIn *)));
// Number of columns to interleave.
274 const unsigned int width=kmax-k0;
// Process rows y0 up to ymax in steps of height.
276 for (
unsigned int y=y0; y<ymax; y+=height) {
// Compute a pointer for all height rows of this step, including rows at or past
// ymax; the row count handed to interleave_block below is clipped to ymax - y.
277 for (
unsigned int r=0; r<height; r++) {
278 row_ptrs[r] = in + ((y + r) * in_stride);
// Sums are integrated only when TOut is an integral type and both integrate_sums
// and row_sum_multiplier are set; the second call (presumably the else branch)
// interleaves without sums. Both calls pass k0 as the row offset and true as
// the first argument.
281 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
282 interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0,
true);
284 interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0,
true);
// Fix up the sums whenever TOut is integral and integrate_sums is requested;
// row_sum_multiplier is not tested here, so a zero multiplier also reaches
// FixupRowSums.
287 if (std::is_integral<TOut>::value && integrate_sums) {
288 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);