Compute Library
 22.05
interleave_indirect.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "asmlib.hpp"
27 #include "convolver.hpp"
28 #include "interleave_indirect.hpp"
29 #include "bfloat.hpp"
30 
31 #if !defined(_WIN64) && !defined(__OpenBSD__)
32 #include <alloca.h>
33 #endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
34 
35 #include <algorithm>
36 #include <cstddef>
37 #include <cstdint>
38 #include <cstdio>
39 #include <cstring>
40 #include <tuple>
41 #include <type_traits>
42 #include <vector>
43 
44 #include <arm_neon.h>
45 
46 #include "utils.hpp"
47 
48 namespace arm_gemm {
49 
50 /*
51  * Core function that does heavy lifting - interleave 'int_by' rows of width 'width' together.
52  *
53  * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining
54  * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad
55  * with a particular value.
56  *
57  * Note that it is not expected for this templated version to ever be used - all cases that matter should be
58  * explicitly specialized with an optimized implementation.
59  */
60 template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
61 void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
62  const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
63 
64  std::vector<int32_t> the_sums;
65 
66  if (integrate_sums) {
67  the_sums = std::vector<int32_t>(int_by, 0);
68 
69  if (!first) {
70  // In 'integrate sums' mode, we dump the sums at the end on each pass.
71 
72  // On the last pass this is correct, but on other passes it is not -
73  // so on the subsequent pass we need to take the output written by
74  // the previous pass as starting point for the sums, and then
75  // overwrite them with new interleaved data.
76  int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
77 
78  // Rewind pointer to where we wrote out the sums last time.
79  out_int32 -= int_by;
80 
81  // Restore the running sums.
82  memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));
83 
84  // Update the "real" pointer so that the next output will clobber the old sums.
85  out = reinterpret_cast<TOut *>(out_int32);
86  }
87  }
88 
89  for (unsigned int pos=0; pos<width; pos+=block) {
90  for (unsigned int row=0; row<int_by; row++) {
91  // Row out of range - pad 'block' entries.
92  if (row >= height) {
93  for (unsigned int col=0; col<block; col++) {
94  *out++ = 0;
95  }
96  continue;
97  }
98 
99  for (unsigned int col=0; col<block; col++) {
100  // Column out of range - pad a single entry
101  if (pos + col >= width) {
102  *out++ = 0;
103  continue;
104  }
105 
106  if (integrate_sums) {
107  the_sums[row] += in[row][row_offset + pos + col];
108  }
109 
110  *out++ = in[row][row_offset + pos + col];
111  }
112  }
113  }
114 
115  if (integrate_sums) {
116  int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
117 
118  memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));
119 
120  out = reinterpret_cast<TOut *>(out_int32 + int_by);
121  }
122 }
123 
124 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
125 inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
126  const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
127 
128  // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
129  if (row_sum_multiplier) {
130  // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
131  // next block (post sums).
132  // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
133  int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
134 
135  out_int32 -= height;
136  for (unsigned int i=0; i<height; i++) {
137  out_int32[i] *= row_sum_multiplier;
138  }
139  } else {
140  // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
141  // sum block. We need to insert the (zero) sums, and advance 'out'.
142  int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
143 
144  for (unsigned int i=0; i<height; i++) {
145  out_int32[i] = 0;
146  }
147 
148  out_int32 += height;
149 
150  out = reinterpret_cast<TOut *>(out_int32);
151  }
152 }
153 
154 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
155 void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
156  unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
157  const unsigned int k0, const unsigned int kmax, bool integrate_sums,
158  const int32_t row_sum_multiplier) {
159  const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
160 
161  // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
162  // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
163  // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
164  // pointers and conditionally overriding the out of range ones.
165 
166  // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
167  // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
168  // expensive in highly threaded scenarios.
169  const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
170 
171  // Figure out the starting position based on k0 (with rounded length)
172  unsigned int start_string = k0 / rounded_stringlen;
173  unsigned int start_stringpos = k0 % rounded_stringlen;
174 
175  // Process blocks of 'height' height...
176  for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
177  // Height to process
178  unsigned int active_height = std::min(ymax - ybase, height);
179 
180  // Track our progress through the various strings
181  unsigned int k_left = (kmax - k0);
182  unsigned int string = start_string;
183  unsigned int stringpos = start_stringpos;
184 
185  bool first = true;
186 
187  // Prepare to call 'interleave_block' above for each string encompassed by K range
188  while (k_left > 0) {
189  // Width to process - and the width we will generate (with padding)
190  unsigned int in_width = std::min(k_left, stringlen - stringpos);
191  unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);
192 
193  const TIn * const *row_base = ptr[string] + ybase;
194 
195  // If not all rows are valid, copy the ones that are into local array (see above comment).
196  if (active_height < height) {
197  for (unsigned int i=0; i<active_height; i++) {
198  row_ptrs[i] = ptr[string][ybase + i];
199  }
200 
201  row_base = row_ptrs;
202  }
203 
204  // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
205  // much code. However, integrated sums make no sense for non-integral types and won't ever be
206  // requested. So put a type trait check here to avoid generating pointless code.
207  if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
208  interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
209  } else {
210  interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
211  }
212 
213  k_left -= out_width;
214  string++;
215  stringpos=0;
216  first=false;
217  }
218 
219  if (std::is_integral<TOut>::value && integrate_sums) {
220  FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
221  }
222  }
223 }
224 
225 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
226 void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
227  const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
228  const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
229 
230  auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
231 
232  // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
233  const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
234 
235  for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
236  // How many of the rows are active - the rest will get padded in interleave_block.
237  unsigned int active_height = std::min(ymax - ybase, height);
238  bool first = true;
239 
240  auto conv_rows = conv_cols.process_rows(ybase, active_height);
241 
242  while (!conv_rows.finished()) {
243  unsigned int width, offset;
244 
245  // Get next set of parameters
246  std::tie(width, offset) = conv_rows.next_block(row_ptrs);
247 
248  // Perform the interleave
249  if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
250  interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
251  } else {
252  interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
253  }
254 
255  first=false;
256  }
257 
258  if (std::is_integral<TOut>::value && integrate_sums) {
259  FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
260  }
261  }
262 }
263 
264 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
265 void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
266  const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
267 
268  // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
269  const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
270 
271  const unsigned int width=kmax-k0;
272 
273  for (unsigned int y=y0; y<ymax; y+=height) {
274  for (unsigned int r=0; r<height; r++) {
275  row_ptrs[r] = in + ((y + r) * in_stride);
276  }
277 
278  if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
279  interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
280  } else {
281  interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
282  }
283 
284  if (std::is_integral<TOut>::value && integrate_sums) {
285  FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
286  }
287  }
288 }
289 
291 
/**** Instantiate needed implementations ****/

/*
 * Explicitly instantiate IndirectInterleave, ConvolutionInterleave and Interleave together for one
 * combination of interleave height, block size, output type and input type (always with VLType::None).
 * The macro is local to this section and is undefined again at the end of it.
 */
#define ARM_GEMM_INSTANTIATE_INTERLEAVES(HEIGHT, BLOCK, TOUT, TIN) \
    template void IndirectInterleave<HEIGHT, BLOCK, VLType::None>(TOUT *, const TIN * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); \
    template void ConvolutionInterleave<HEIGHT, BLOCK, VLType::None>(TOUT *, const TIN *, size_t, const convolver<TIN> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t); \
    template void Interleave<HEIGHT, BLOCK, VLType::None>(TOUT *, const TIN *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t)

/* AArch32 */
#ifdef __arm__
/* FP32 */
/* Arm® Neon™ implementation (height 6) */
ARM_GEMM_INSTANTIATE_INTERLEAVES(6, 1, float, float);

/* FP16 */
#if __ARM_FP16_ARGS
/* Arm® Neon™ implementation using FP32 kernel (height 6) */
ARM_GEMM_INSTANTIATE_INTERLEAVES(6, 1, float, __fp16);
#endif /* __ARM_FP16_ARGS */

/* BF16 */
/* Arm® Neon™ implementation using FP32 kernel */
ARM_GEMM_INSTANTIATE_INTERLEAVES(6, 1, float, bfloat16);
#endif

/* AArch64 */
#ifdef __aarch64__
/* FP32 */
/* Arm® Neon™/SVE implementation (height 8) */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 1, float, float);

#if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVEF32MM)
/* FMMLA */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 2, float, float);
#endif // ARM_COMPUTE_ENABLE_SVE && ARM_COMPUTE_ENABLE_SVEF32MM

/* FP16 */
#if defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 1, __fp16, __fp16);
#endif // FP16_KERNELS or __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

/* FP16 input, FP32 output */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 1, float, __fp16);

/* BF16 */
/* Arm® Neon™/SVE BFDOT */
#ifdef ARM_COMPUTE_ENABLE_BF16
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 2, bfloat16, bfloat16);

ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 4, bfloat16, bfloat16);

/* FP32 input, BF16 output */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 4, bfloat16, float);
#endif // ARM_COMPUTE_ENABLE_BF16

/* Arm® Neon™/SVE using FP32 kernel */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 1, float, bfloat16);

/* INT16 */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 1, int16_t, int16_t);

ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 1, uint16_t, uint16_t);

/* INT8 */
/* Arm® Neon™ SMLA/SMLAL (height 4, block 16) */
ARM_GEMM_INSTANTIATE_INTERLEAVES(4, 16, int8_t, int8_t);

/* Arm® Neon™ SDOT (height 8, block 4) */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 4, int8_t, int8_t);

/* MMLA SMMLA (height 8, block 8) */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 8, int8_t, int8_t);

/* Arm® Neon™ 16-bit (height 8, block 1) */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 1, int16_t, int8_t);

/* Unsigned 8-bit counterparts of the INT8 instantiations above. */
/* Arm® Neon™ SMLA/SMLAL (height 4, block 16) */
ARM_GEMM_INSTANTIATE_INTERLEAVES(4, 16, uint8_t, uint8_t);

/* Arm® Neon™ SDOT (height 8, block 4) */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 4, uint8_t, uint8_t);

/* MMLA SMMLA (height 8, block 8) */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 8, uint8_t, uint8_t);

/* Arm® Neon™ 16-bit (height 8, block 1) */
ARM_GEMM_INSTANTIATE_INTERLEAVES(8, 1, uint16_t, uint8_t);
#endif // __aarch64__

#undef ARM_GEMM_INSTANTIATE_INTERLEAVES
414 
415 } // namespace arm_gemm
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:1083
Brain floating point representation class.
Definition: Bfloat16.h:81
void IndirectInterleave(TOut *out, const TIn *const *const *ptr, unsigned int stringlen, unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
void FixupRowSums(TOut *&out, const int32_t row_sum_multiplier)
column_handler process_columns(const T *input_base, size_t input_stride, unsigned int k_start, unsigned int k_end, unsigned int rounded_stringlen) const
Definition: convolver.hpp:176
void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver< TIn > &conv, const unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
void interleave_block(TOut *&out, const TIn *const *in, size_t width, size_t height, size_t row_offset, bool first)