31 #if !defined(_WIN64) && !defined(__OpenBSD__) 41 #include <type_traits> 60 template<
unsigned int height_vectors,
unsigned int block, VLType vlt,
bool integrate_sums,
typename TIn,
typename TOut>
61 void interleave_block( TOut * &out,
const TIn *
const *in,
size_t width,
size_t height,
size_t row_offset,
bool first) {
62 const unsigned int int_by = height_vectors * (vlt ==
VLType::SVE ? get_vector_length<TOut>() / block : 1);
64 std::vector<int32_t> the_sums;
67 the_sums = std::vector<int32_t>(int_by, 0);
76 int32_t *out_int32 =
reinterpret_cast<int32_t *
>(out);
82 memcpy(the_sums.data(), out_int32, int_by *
sizeof(int32_t));
85 out =
reinterpret_cast<TOut *
>(out_int32);
89 for (
unsigned int pos=0; pos<width; pos+=block) {
90 for (
unsigned int row=0; row<int_by; row++) {
93 for (
unsigned int col=0; col<block; col++) {
99 for (
unsigned int col=0; col<block; col++) {
101 if (pos + col >= width) {
106 if (integrate_sums) {
107 the_sums[row] += in[row][row_offset + pos + col];
110 *out++ = in[row][row_offset + pos + col];
115 if (integrate_sums) {
116 int32_t *out_int32 =
reinterpret_cast<int32_t *
>(out);
118 memcpy(out_int32, the_sums.data(), int_by *
sizeof(int32_t));
120 out =
reinterpret_cast<TOut *
>(out_int32 + int_by);
124 template<
unsigned int height_vectors,
unsigned int block, VLType vlt,
typename TOut>
125 inline void FixupRowSums(TOut * &out,
const int32_t row_sum_multiplier) {
126 const unsigned int height = height_vectors * (vlt ==
VLType::SVE ? get_vector_length<TOut>() / block : 1);
129 if (row_sum_multiplier) {
133 int32_t *out_int32 =
reinterpret_cast<int32_t *
>(out);
136 for (
unsigned int i=0; i<height; i++) {
137 out_int32[i] *= row_sum_multiplier;
142 int32_t *out_int32 =
reinterpret_cast<int32_t *
>(out);
144 for (
unsigned int i=0; i<height; i++) {
150 out =
reinterpret_cast<TOut *
>(out_int32);
154 template<
unsigned int height_vectors,
unsigned int block, VLType vlt,
typename TIn,
typename TOut>
156 unsigned int rounded_stringlen,
const unsigned int y0,
const unsigned int ymax,
157 const unsigned int k0,
const unsigned int kmax,
bool integrate_sums,
158 const int32_t row_sum_multiplier) {
159 const unsigned int height = height_vectors * (vlt ==
VLType::SVE ? get_vector_length<TOut>() / block : 1);
169 const TIn **row_ptrs =
reinterpret_cast<const TIn **
>(alloca(height *
sizeof(
const TIn *)));
172 unsigned int start_string = k0 / rounded_stringlen;
173 unsigned int start_stringpos = k0 % rounded_stringlen;
176 for (
unsigned int ybase = y0; ybase < ymax; ybase+=height) {
178 unsigned int active_height = std::min(ymax - ybase, height);
181 unsigned int k_left = (kmax - k0);
182 unsigned int string = start_string;
183 unsigned int stringpos = start_stringpos;
190 unsigned int in_width = std::min(k_left, stringlen - stringpos);
191 unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);
193 const TIn *
const *row_base = ptr[string] + ybase;
196 if (active_height < height) {
197 for (
unsigned int i=0; i<active_height; i++) {
198 row_ptrs[i] = ptr[string][ybase + i];
207 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
208 interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
210 interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
219 if (std::is_integral<TOut>::value && integrate_sums) {
220 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
225 template<
unsigned int height_vectors,
unsigned int block, VLType vlt,
typename TIn,
typename TOut>
227 const unsigned int y0,
const unsigned int ymax,
const unsigned int k0,
const unsigned int kmax,
bool integrate_sums,
const int32_t row_sum_multiplier) {
228 const unsigned int height = height_vectors * (vlt ==
VLType::SVE ? get_vector_length<TOut>() / block : 1);
230 auto conv_cols = conv.
process_columns(in, in_stride, k0, kmax, rounded_stringlen);
233 const TIn **row_ptrs =
reinterpret_cast<const TIn **
>(alloca(height *
sizeof(
const TIn *)));
235 for (
unsigned int ybase = y0; ybase < ymax; ybase += height) {
237 unsigned int active_height = std::min(ymax - ybase, height);
240 auto conv_rows = conv_cols.process_rows(ybase, active_height);
242 while (!conv_rows.finished()) {
243 unsigned int width,
offset;
246 std::tie(width, offset) = conv_rows.next_block(row_ptrs);
249 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
250 interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height,
offset, first);
252 interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height,
offset, first);
258 if (std::is_integral<TOut>::value && integrate_sums) {
259 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
264 template<
unsigned int height_vectors,
unsigned int block, VLType vlt,
typename TIn,
typename TOut>
265 void Interleave(TOut *out,
const TIn *in,
size_t in_stride,
const unsigned int y0,
const unsigned int ymax,
const unsigned int k0,
const unsigned int kmax,
bool integrate_sums,
const int32_t row_sum_multiplier) {
266 const unsigned int height = height_vectors * (vlt ==
VLType::SVE ? get_vector_length<TOut>() / block : 1);
269 const TIn **row_ptrs =
reinterpret_cast<const TIn **
>(alloca(height *
sizeof(
const TIn *)));
271 const unsigned int width=kmax-k0;
273 for (
unsigned int y=y0; y<ymax; y+=height) {
274 for (
unsigned int r=0; r<height; r++) {
275 row_ptrs[r] = in + ((y + r) * in_stride);
278 if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
279 interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0,
true);
281 interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0,
true);
284 if (std::is_integral<TOut>::value && integrate_sums) {
285 FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
298 template void IndirectInterleave<6, 1, VLType::None>(
float *,
const float *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
299 template void ConvolutionInterleave<6, 1, VLType::None>(
float *,
const float *, size_t,
const convolver<float> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
300 template void Interleave<6, 1, VLType::None>(
float *,
const float *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
305 template void IndirectInterleave<6, 1, VLType::None>(
float *,
const __fp16 *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
306 template void ConvolutionInterleave<6, 1, VLType::None>(
float *,
const __fp16 *, size_t,
const convolver<__fp16> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
307 template void Interleave<6, 1, VLType::None>(
float *,
const __fp16 *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
312 template void IndirectInterleave<6, 1, VLType::None>(
float *,
const bfloat16 *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
313 template void ConvolutionInterleave<6, 1, VLType::None>(
float *,
const bfloat16 *, size_t,
const convolver<bfloat16> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
314 template void Interleave<6, 1, VLType::None>(
float *,
const bfloat16 *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
321 template void IndirectInterleave<8, 1, VLType::None>(
float *,
const float *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
322 template void ConvolutionInterleave<8, 1, VLType::None>(
float *,
const float *, size_t,
const convolver<float> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
323 template void Interleave<8, 1, VLType::None>(
float *,
const float *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
325 #if defined(ARM_COMPUTE_ENABLE_SVE) && defined(ARM_COMPUTE_ENABLE_SVEF32MM) 327 template void IndirectInterleave<8, 2, VLType::None>(
float *,
const float *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
328 template void ConvolutionInterleave<8, 2, VLType::None>(
float *,
const float *, size_t,
const convolver<float> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
329 template void Interleave<8, 2, VLType::None>(
float *,
const float *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
330 #endif // ARM_COMPUTE_ENABLE_SVE && ARM_COMPUTE_ENABLE_SVEF32MM 333 #if defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) 334 template void IndirectInterleave<8, 1, VLType::None>(__fp16 *,
const __fp16 *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
335 template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *,
const __fp16 *, size_t,
const convolver<__fp16> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
336 template void Interleave<8, 1, VLType::None>(__fp16 *,
const __fp16 *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
337 #endif // FP16_KERNELS ar __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 339 template void IndirectInterleave<8, 1, VLType::None>(
float *,
const __fp16 *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
340 template void ConvolutionInterleave<8, 1, VLType::None>(
float *,
const __fp16 *, size_t,
const convolver<__fp16> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
341 template void Interleave<8, 1, VLType::None>(
float *,
const __fp16 *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
345 #ifdef ARM_COMPUTE_ENABLE_BF16 346 template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *,
const bfloat16 *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
347 template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *,
const bfloat16 *, size_t,
const convolver<bfloat16> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
348 template void Interleave<8, 2, VLType::None>(bfloat16 *,
const bfloat16 *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
350 template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *,
const bfloat16 *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
351 template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *,
const bfloat16 *, size_t,
const convolver<bfloat16> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
352 template void Interleave<8, 4, VLType::None>(bfloat16 *,
const bfloat16 *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
354 template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *,
const float *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
355 template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *,
const float *, size_t,
const convolver<float> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
356 template void Interleave<8, 4, VLType::None>(bfloat16 *,
const float *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
357 #endif // ARM_COMPUTE_ENABLE_BF16 360 template void IndirectInterleave<8, 1, VLType::None>(
float *,
const bfloat16 *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
361 template void ConvolutionInterleave<8, 1, VLType::None>(
float *,
const bfloat16 *, size_t,
const convolver<bfloat16> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
362 template void Interleave<8, 1, VLType::None>(
float *,
const bfloat16 *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
365 template void IndirectInterleave<8, 1, VLType::None>(int16_t *,
const int16_t *
const *
const *,
unsigned int,
unsigned int,
unsigned int y0,
unsigned int ymax,
unsigned int k0,
unsigned int kmax, bool, int32_t);
366 template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *,
const int16_t *, size_t,
const convolver<int16_t> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
367 template void Interleave<8, 1, VLType::None>(int16_t *,
const int16_t *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
369 template void IndirectInterleave<8, 1, VLType::None>(uint16_t *,
const uint16_t *
const *
const *,
unsigned int,
unsigned int,
unsigned int y0,
unsigned int ymax,
unsigned int k0,
unsigned int kmax, bool, int32_t);
370 template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *,
const uint16_t *, size_t,
const convolver<uint16_t> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
371 template void Interleave<8, 1, VLType::None>(uint16_t *,
const uint16_t *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
375 template void IndirectInterleave<4, 16, VLType::None>(int8_t *,
const int8_t *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
376 template void ConvolutionInterleave<4, 16, VLType::None>(int8_t *,
const int8_t *, size_t,
const convolver<int8_t> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
377 template void Interleave<4, 16, VLType::None>(int8_t *,
const int8_t *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
380 template void IndirectInterleave<8, 4, VLType::None>(int8_t *,
const int8_t *
const *
const *,
unsigned int,
unsigned int,
unsigned int y0,
unsigned int ymax,
unsigned int k0,
unsigned int kmax, bool, int32_t);
381 template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *,
const int8_t *, size_t,
const convolver<int8_t> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
382 template void Interleave<8, 4, VLType::None>(int8_t *,
const int8_t *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
385 template void IndirectInterleave<8, 8, VLType::None>(int8_t *,
const int8_t *
const *
const *,
unsigned int,
unsigned int,
unsigned int y0,
unsigned int ymax,
unsigned int k0,
unsigned int kmax, bool, int32_t);
386 template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *,
const int8_t *, size_t,
const convolver<int8_t> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
387 template void Interleave<8, 8, VLType::None>(int8_t *,
const int8_t *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
390 template void IndirectInterleave<8, 1, VLType::None>(int16_t *,
const int8_t *
const *
const *,
unsigned int,
unsigned int,
unsigned int y0,
unsigned int ymax,
unsigned int k0,
unsigned int kmax, bool, int32_t);
391 template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *,
const int8_t *, size_t,
const convolver<int8_t> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
392 template void Interleave<8, 1, VLType::None>(int16_t *,
const int8_t *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
395 template void IndirectInterleave<4, 16, VLType::None>(uint8_t *,
const uint8_t *
const *
const *,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
396 template void ConvolutionInterleave<4, 16, VLType::None>(uint8_t *,
const uint8_t *, size_t,
const convolver<uint8_t> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
397 template void Interleave<4, 16, VLType::None>(uint8_t *,
const uint8_t *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
400 template void IndirectInterleave<8, 4, VLType::None>(uint8_t *,
const uint8_t *
const *
const *,
unsigned int,
unsigned int,
unsigned int y0,
unsigned int ymax,
unsigned int k0,
unsigned int kmax, bool, int32_t);
401 template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *,
const uint8_t *, size_t,
const convolver<uint8_t> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
402 template void Interleave<8, 4, VLType::None>(uint8_t *,
const uint8_t *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
405 template void IndirectInterleave<8, 8, VLType::None>(uint8_t *,
const uint8_t *
const *
const *,
unsigned int,
unsigned int,
unsigned int y0,
unsigned int ymax,
unsigned int k0,
unsigned int kmax, bool, int32_t);
406 template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *,
const uint8_t *, size_t,
const convolver<uint8_t> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
407 template void Interleave<8, 8, VLType::None>(uint8_t *,
const uint8_t *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
410 template void IndirectInterleave<8, 1, VLType::None>(uint16_t *,
const uint8_t *
const *
const *,
unsigned int,
unsigned int,
unsigned int y0,
unsigned int ymax,
unsigned int k0,
unsigned int kmax, bool, int32_t);
411 template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *,
const uint8_t *, size_t,
const convolver<uint8_t> &,
unsigned int,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
412 template void Interleave<8, 1, VLType::None>(uint16_t *,
const uint8_t *, size_t,
unsigned int,
unsigned int,
unsigned int,
unsigned int, bool, int32_t);
413 #endif // __aarch64__ __global uchar * offset(const Image *img, int x, int y)
Get the pointer position of an Image.
Brain floating point representation class.
void IndirectInterleave(TOut *out, const TIn *const *const *ptr, unsigned int stringlen, unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
void FixupRowSums(TOut *&out, const int32_t row_sum_multiplier)
column_handler process_columns(const T *input_base, size_t input_stride, unsigned int k_start, unsigned int k_end, unsigned int rounded_stringlen) const
void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver< TIn > &conv, const unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
void interleave_block(TOut *&out, const TIn *const *in, size_t width, size_t height, size_t row_offset, bool first)