const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX);
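// The store_results() overloads below pick the narrowing/saturation sequence
// based on the accumulator type (s32/u32/s16/u16) and the output type (u8 or
// s16). Each one narrows two vectors of results to the output format with
// saturation, e.g. s32 -> s16 via vqmovn_s32, or s32 -> u8 via vqmovun_s32
// followed by vqmovn_u16. Unsigned accumulators going to s16 are clamped to
// INT16_MAX with the max_int16 constant above before reinterpreting.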
inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output)
{
    const int16x8_t s16results = vcombine_s16(vqmovn_s32(out),
                                              vqmovn_s32(out2));
    vst1q_s16(output, s16results);
}
inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output)
{
    const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovun_s32(out),
                                                        vqmovun_s32(out2)));
    vst1_u8(output, u8results);
}
inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output)
{
    const uint16x8_t u16results = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2));
    const int16x8_t  s16results = vreinterpretq_s16_u16(vminq_u16(u16results, max_int16));
    vst1q_s16(output, s16results);
}
inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output)
{
    const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovn_u32(out),
                                                        vqmovn_u32(out2)));
    vst1_u8(output, u8results);
}
inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output)
{
    vst1q_s16(output, out);
    vst1q_s16(output + 8, out2);
}
inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output)
{
    const uint8x16_t u8results = vcombine_u8(vqmovun_s16(out),
                                             vqmovun_s16(out2));
    vst1q_u8(output, u8results);
}
inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output)
{
    const uint8x16_t u8results = vcombine_u8(vqmovn_u16(out),
                                             vqmovn_u16(out2));
    vst1q_u8(output, u8results);
}
inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output)
{
    vst1q_s16(output, vreinterpretq_s16_u16(vminq_u16(out, max_int16)));
    vst1q_s16(output + 8, vreinterpretq_s16_u16(vminq_u16(out2, max_int16)));
}
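// The convolve_rowNx1 helpers below all follow the same scheme: 16 input
// pixels are widened from u8 to s16 and split into 4-lane blocks, then each
// kernel tap is applied with a widening multiply-accumulate (vmlal_s16) on a
// window shifted with vext_s16. For example, with row.val[0] = {p0,p1,p2,p3}
// and row.val[1] = {p4,p5,p6,p7}, vext_s16(row.val[0], row.val[1], 1) yields
// {p1,p2,p3,p4}, i.e. the same four output pixels seen one tap to the right.
// `out` accumulates results for pixels [0,3] and `out2` for pixels [4,7].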
inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data,
                                     const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2)
{
    // Convert to s16 and split in blocks of 4 values:
    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4x3_t row =
    {
        {
            vget_low_s16(s16_tmp0),
            vget_high_s16(s16_tmp0),
            vget_low_s16(s16_tmp1)
        }
    };

    // Calculate row left, middle and right values for pixels [0,3]
    out = vmlal_s16(out, row.val[0], mat0);
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);

    // Calculate row left, middle and right values for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[1], mat0);
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
}
inline void convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t mat0 = vld1_dup_s16(convolution);
    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);

    convolve_row3x1_unrolled(out, out2, row_data, mat0, mat1, mat2);
}
inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t mat0 = vld1_dup_s16(convolution);
    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);

    // Convert to s16 and split in blocks of 4 values:
    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4x3_t row =
    {
        {
            vget_low_s16(s16_tmp0),
            vget_high_s16(s16_tmp0),
            vget_low_s16(s16_tmp1)
        }
    };

    // Calculate the 5 taps for pixels [0,3]
    out = vmlal_s16(out, row.val[0], mat0);
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
    out = vmlal_s16(out, row.val[1], mat4);

    // Calculate the 5 taps for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[1], mat0);
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
    out2 = vmlal_s16(out2, row.val[2], mat4);
}
inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t mat0 = vld1_dup_s16(convolution);
    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
    const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
    const int16x4_t mat6 = vld1_dup_s16(convolution + 6);

    // Convert to s16 and split in blocks of 4 values:
    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4x4_t row =
    {
        {
            vget_low_s16(s16_tmp0),
            vget_high_s16(s16_tmp0),
            vget_low_s16(s16_tmp1),
            vget_high_s16(s16_tmp1)
        }
    };

    // Calculate the 7 taps for pixels [0,3]
    out = vmlal_s16(out, row.val[0], mat0);
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
    out = vmlal_s16(out, row.val[1], mat4);
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);

    // Calculate the 7 taps for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[1], mat0);
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
    out2 = vmlal_s16(out2, row.val[2], mat4);
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
}
inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution)
{
    const int16x4_t mat0 = vld1_dup_s16(convolution);
    const int16x4_t mat1 = vld1_dup_s16(convolution + 1);
    const int16x4_t mat2 = vld1_dup_s16(convolution + 2);
    const int16x4_t mat3 = vld1_dup_s16(convolution + 3);
    const int16x4_t mat4 = vld1_dup_s16(convolution + 4);
    const int16x4_t mat5 = vld1_dup_s16(convolution + 5);
    const int16x4_t mat6 = vld1_dup_s16(convolution + 6);
    const int16x4_t mat7 = vld1_dup_s16(convolution + 7);
    const int16x4_t mat8 = vld1_dup_s16(convolution + 8);

    // Convert to s16 and split in blocks of 4 values:
    const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data)));
    const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data)));

    const int16x4x4_t row =
    {
        {
            vget_low_s16(s16_tmp0),
            vget_high_s16(s16_tmp0),
            vget_low_s16(s16_tmp1),
            vget_high_s16(s16_tmp1)
        }
    };

    // Calculate the 9 taps for pixels [0,3]
    out = vmlal_s16(out, row.val[0], mat0);
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1);
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2);
    out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3);
    out = vmlal_s16(out, row.val[1], mat4);
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5);
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6);
    out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7);
    out = vmlal_s16(out, row.val[2], mat8);

    // Calculate the 9 taps for pixels [4,7]
    out2 = vmlal_s16(out2, row.val[1], mat0);
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1);
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2);
    out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3);
    out2 = vmlal_s16(out2, row.val[2], mat4);
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5);
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6);
    out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7);
    out2 = vmlal_s16(out2, row.val[3], mat8);
}
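// NEConvolutionKernel<matrix_size> runs a full matrix_size x matrix_size
// convolution on a U8 input, producing U8 or S16 output. Each iteration
// processes 8 output pixels: one convolve_rowNx1 call per matrix row
// accumulates into two s32 vectors, which are then scaled and stored.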
template <unsigned int matrix_size>
NEConvolutionKernel<matrix_size>::NEConvolutionKernel()
    : INESimpleKernel(), _scale(0), _convolution{ {} }
{
}

template <unsigned int matrix_size>
BorderSize NEConvolutionKernel<matrix_size>::border_size() const
{
    return BorderSize{ matrix_size / 2 };
}

template <unsigned int matrix_size>
void NEConvolutionKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

    set_shape_if_empty(*output->info(), input->info()->tensor_shape());

    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);

    _input  = input;
    _output = output;

    std::copy_n(conv, _convolution.size(), _convolution.begin());

    // A scale of 0 means the scale is computed from the matrix coefficients
    _scale = (scale == 0) ? calculate_matrix_scale(_convolution.data(), matrix_size) : scale;

    // Configure kernel window
    constexpr unsigned int num_elems_read_per_iteration    = 16;
    constexpr unsigned int num_elems_written_per_iteration = 8;

    Window                 win = calculate_max_window(input->info()->valid_region(), Steps(num_elems_written_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}
template <>
template <typename OutputType>
void NEConvolutionKernel<3>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    // Load the matrix's coefficients into Neon registers:
    const int16x4_t   mat00     = vld1_dup_s16(_convolution.data());
    const int16x4_t   mat01     = vld1_dup_s16(_convolution.data() + 1);
    const int16x4_t   mat02     = vld1_dup_s16(_convolution.data() + 2);
    const int16x4_t   mat10     = vld1_dup_s16(_convolution.data() + 3);
    const int16x4_t   mat11     = vld1_dup_s16(_convolution.data() + 4);
    const int16x4_t   mat12     = vld1_dup_s16(_convolution.data() + 5);
    const int16x4_t   mat20     = vld1_dup_s16(_convolution.data() + 6);
    const int16x4_t   mat21     = vld1_dup_s16(_convolution.data() + 7);
    const int16x4_t   mat22     = vld1_dup_s16(_convolution.data() + 8);
    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    const unsigned char *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, -1));
    const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0));
    const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes of the top row:
        const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02);

        // Load 16 bytes of the middle row:
        const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12);

        // Load 16 bytes of the low row:
        const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset());
        convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22);

        // Apply scale: convert to F32, multiply by 1/scale, convert back to S32
        out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
        out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));

        // Clamp and store as U8 or S16
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
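// Note on scaling: the s32 accumulators are scaled by 1/_scale in float and
// converted back with vcvtq_s32_f32 (truncation toward zero). For a smoothing
// kernel the scale is typically the sum of the coefficients, e.g. 16 for the
// 3x3 Gaussian {1,2,1; 2,4,2; 1,2,1}, keeping the output in the input range.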
template <>
template <typename OutputType>
void NEConvolutionKernel<5>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes of each of the 5 rows and convolve with the matching matrix row:
        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row5x1(out, out2, data_t2, _convolution.data());

        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row5x1(out, out2, data_t1, _convolution.data() + 5);

        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row5x1(out, out2, data_m, _convolution.data() + 10);

        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row5x1(out, out2, data_b1, _convolution.data() + 15);

        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row5x1(out, out2, data_b2, _convolution.data() + 20);

        // Apply scale: convert to F32, multiply by 1/scale, convert back to S32
        out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
        out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));

        // Clamp and store as U8 or S16
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
template <>
template <typename OutputType>
void NEConvolutionKernel<7>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3));
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2));
    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes of each of the 7 rows and convolve with the matching matrix row:
        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
        convolve_row7x1(out, out2, data_t3, _convolution.data());

        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row7x1(out, out2, data_t2, _convolution.data() + 7);

        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row7x1(out, out2, data_t1, _convolution.data() + 14);

        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row7x1(out, out2, data_m, _convolution.data() + 21);

        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row7x1(out, out2, data_b1, _convolution.data() + 28);

        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row7x1(out, out2, data_b2, _convolution.data() + 35);

        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
        convolve_row7x1(out, out2, data_b3, _convolution.data() + 42);

        // Apply scale: convert to F32, multiply by 1/scale, convert back to S32
        out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
        out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));

        // Clamp and store as U8 or S16
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
template <>
template <typename OutputType>
void NEConvolutionKernel<9>::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale);

    const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4));
    const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3));
    const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2));
    const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1));
    const unsigned char *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0));
    const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1));
    const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2));
    const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3));
    const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4));

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Load 16 bytes of each of the 9 rows and convolve with the matching matrix row:
        const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset());
        convolve_row9x1(out, out2, data_t4, _convolution.data());

        const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset());
        convolve_row9x1(out, out2, data_t3, _convolution.data() + 9);

        const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset());
        convolve_row9x1(out, out2, data_t2, _convolution.data() + 18);

        const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset());
        convolve_row9x1(out, out2, data_t1, _convolution.data() + 27);

        const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset());
        convolve_row9x1(out, out2, data_m, _convolution.data() + 36);

        const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset());
        convolve_row9x1(out, out2, data_b1, _convolution.data() + 45);

        const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset());
        convolve_row9x1(out, out2, data_b2, _convolution.data() + 54);

        const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset());
        convolve_row9x1(out, out2, data_b3, _convolution.data() + 63);

        const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset());
        convolve_row9x1(out, out2, data_b4, _convolution.data() + 72);

        // Apply scale: convert to F32, multiply by 1/scale, convert back to S32
        out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
        out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));

        // Clamp and store as U8 or S16
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
template <unsigned int matrix_size>
void NEConvolutionKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    switch(_output->info()->data_type())
    {
        case DataType::U8:
            convolution<uint8_t>(window);
            break;
        case DataType::S16:
            convolution<int16_t>(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type");
            break;
    }
}
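// The separable kernels below split a square convolution into a horizontal
// 1D pass (NESeparableConvolutionHorKernel) followed by a vertical 1D pass
// (NESeparableConvolutionVertKernel). This works whenever the matrix is the
// outer product of a column and a row vector, e.g. the 3x3 Gaussian above is
// {1,2,1}^T * {1,2,1}, and reduces the work from N^2 to 2N taps per pixel.
// The horizontal pass writes an intermediate U16, S16 or S32 image.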
template <unsigned int matrix_size>
NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
    : _conv_row{ { 0 } }, _border_size(0)
{
}

template <unsigned int matrix_size>
BorderSize NESeparableConvolutionHorKernel<matrix_size>::border_size() const
{
    return _border_size;
}

template <unsigned int matrix_size>
void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

    _input  = input;
    _output = output;
    std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
    _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);

    // Configure kernel window
    constexpr unsigned int num_elems_read_per_iteration    = 16;
    constexpr unsigned int num_elems_written_per_iteration = 8;

    Window                 win = calculate_max_window_horizontal(input->info()->valid_region(), Steps(num_elems_written_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}
template <unsigned int matrix_size>
void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    switch(_output->info()->data_type())
    {
        case DataType::U16:
            convolve<uint16_t>(window);
            break;
        case DataType::S16:
            convolve<int16_t>(window);
            break;
        case DataType::S32:
            convolve<int32_t>(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported intermediate data type");
            break;
    }
}
template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
{
    // Shift the input window left so the first tap reads the border pixels
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);

        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}
template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<int16_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}
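// The S32 variant below widens each multiply to 32 bits (vmull_n_s16 /
// vmlal_n_s16) and therefore produces two int32x4_t halves per 8 pixels.
// This intermediate cannot overflow for a 5-tap row of s16 coefficients on
// u8 data (5 * 255 * 32767 is about 4.2e7, well inside s32), at the cost of
// twice the stores compared with the 16-bit variants.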
template <>
template <>
inline void NESeparableConvolutionHorKernel<5>::convolve<int32_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -2);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);

        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}
template <>
template <>
inline void NESeparableConvolutionHorKernel<7>::convolve<uint16_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);

        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}
template <>
template <>
inline void NESeparableConvolutionHorKernel<7>::convolve<int16_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}
template <>
template <>
inline void NESeparableConvolutionHorKernel<7>::convolve<int32_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -3);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);

        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[5]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}
template <>
template <>
inline void NESeparableConvolutionHorKernel<9>::convolve<uint16_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const uint16x8x2_t data_u16 =
        {
            {
                vmovl_u8(vget_low_u8(data)),
                vmovl_u8(vget_high_u8(data))
            }
        };

        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]);
        out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]);
        out = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]);

        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
    },
    input, output);
}
template <>
template <>
inline void NESeparableConvolutionHorKernel<9>::convolve<int16_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]);
        out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]);
        out = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]);

        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
    },
    input, output);
}
template <>
template <>
inline void NESeparableConvolutionHorKernel<9>::convolve<int32_t>(const Window &window)
{
    Window win_in(window);
    win_in.shift(Window::DimX, -4);

    Iterator input(_input, win_in);
    Iterator output(_output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        const uint8x16_t data = vld1q_u8(input.ptr());

        const int16x8x2_t data_s16 =
        {
            {
                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
            }
        };

        const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1);
        const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2);
        const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3);
        const int16x8_t data_s16_m  = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);

        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
        out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);

        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
        out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);

        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
    },
    input, output);
}
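// NESeparableConvolutionVertKernel consumes the intermediate U16/S16/S32
// image produced by the horizontal pass, applies the column coefficients
// with plain (non-widening) multiply-accumulates where the type permits,
// then scales by 1/_scale in float and narrows to the final U8/S16 output.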
template <unsigned int matrix_size>
NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
    : _conv_col{ { 0 } }, _scale(0)
{
}

template <unsigned int matrix_size>
BorderSize NESeparableConvolutionVertKernel<matrix_size>::border_size() const
{
    return BorderSize{ matrix_size / 2, 0 };
}

template <unsigned int matrix_size>
void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
    ARM_COMPUTE_ERROR_ON(scale == 0);

    _input  = input;
    _output = output;
    std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
    _scale = scale;

    // Configure kernel window
    constexpr unsigned int num_elems_read_per_iteration    = 16;
    constexpr unsigned int num_elems_written_per_iteration = 16;

    Window                 win = calculate_max_window(input->info()->valid_region(), Steps(num_elems_written_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}
template <unsigned int matrix_size>
void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    switch(_input->info()->data_type())
    {
        case DataType::U16:
            switch(_output->info()->data_type())
            {
                case DataType::U8:
                    convolution_u16<uint8_t>(window);
                    break;
                case DataType::S16:
                    convolution_u16<int16_t>(window);
                    break;
                default:
                    ARM_COMPUTE_ERROR("Unsupported output data type");
            }
            break;
        case DataType::S16:
            switch(_output->info()->data_type())
            {
                case DataType::U8:
                    convolution_s16<uint8_t>(window);
                    break;
                case DataType::S16:
                    convolution_s16<int16_t>(window);
                    break;
                default:
                    ARM_COMPUTE_ERROR("Unsupported output data type");
            }
            break;
        case DataType::S32:
            switch(_output->info()->data_type())
            {
                case DataType::U8:
                    convolution_s32<uint8_t>(window);
                    break;
                case DataType::S16:
                    convolution_s32<int16_t>(window);
                    break;
                default:
                    ARM_COMPUTE_ERROR("Unsupported output data type");
            }
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported intermediate data type");
            break;
    }
}
template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    // Step 8 elements at a time so each half of the 16-element iteration can be loaded separately
    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int         k_half       = matrix_size / 2;

    // Set row pointers
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    execute_window_loop(win, [&](const Coordinates &)
    {
        uint16x8_t out0 = vdupq_n_u16(0);
        uint16x8_t out1 = vdupq_n_u16(0);

        // First half: accumulate the column taps for pixels [0,7]
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
            out0 = vmlaq_n_u16(out0, data, _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Second half: accumulate the column taps for pixels [8,15]
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
            out1 = vmlaq_n_u16(out1, data, _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Convert to F32, scale by 1/scale, convert back and store
        float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
        float32x4_t out0_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
        out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
        out0_f32_low  = vmulq_f32(out0_f32_low, oneoverscale);
        store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));

        float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
        float32x4_t out1_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
        out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
        out1_f32_low  = vmulq_f32(out1_f32_low, oneoverscale);
        store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
    },
    in, out);
}
template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int         k_half       = matrix_size / 2;

    // Set row pointers
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    execute_window_loop(win, [&](const Coordinates &)
    {
        int16x8_t out0 = vdupq_n_s16(0);
        int16x8_t out1 = vdupq_n_s16(0);

        // First half: accumulate the column taps for pixels [0,7]
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
            out0 = vmlaq_n_s16(out0, data, _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Second half: accumulate the column taps for pixels [8,15]
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int16x8_t data = vld1q_s16(reinterpret_cast<const int16_t *>(input_ptrs[r] + in.offset()));
            out1 = vmlaq_n_s16(out1, data, _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Convert to F32, scale by 1/scale, convert back and store
        float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0)));
        float32x4_t out0_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0)));
        out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale);
        out0_f32_low  = vmulq_f32(out0_f32_low, oneoverscale);
        store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));

        float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1)));
        float32x4_t out1_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1)));
        out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale);
        out1_f32_low  = vmulq_f32(out1_f32_low, oneoverscale);
        store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
    },
    in, out);
}
template <unsigned int matrix_size>
template <typename OutputType>
void NESeparableConvolutionVertKernel<matrix_size>::convolution_s32(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");

    Window win_in(win);
    win_in.set_dimension_step(Window::DimX, 8);

    Iterator in(_input, win_in);
    Iterator out(_output, win);

    std::array<unsigned char *, matrix_size> input_ptrs{ {} };
    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
    const int         k_half       = matrix_size / 2;

    // Set row pointers
    for(int i = -k_half; i <= k_half; ++i)
    {
        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
    }

    const int32x4_t zero = vdupq_n_s32(0);

    execute_window_loop(win, [&](const Coordinates &)
    {
        // vld2q_s32 de-interleaves each 8-element block into two 4-lane vectors,
        // so each accumulator pair holds alternating lanes
        int32x4x2_t out0 = { { zero, zero } };
        int32x4x2_t out1 = { { zero, zero } };

        // First half: accumulate the column taps for pixels [0,7]
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
            out0.val[0] = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]);
            out0.val[1] = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Second half: accumulate the column taps for pixels [8,15]
        for(unsigned int r = 0; r < matrix_size; ++r)
        {
            const int32x4x2_t data = vld2q_s32(reinterpret_cast<const int32_t *>(input_ptrs[r] + in.offset()));
            out1.val[0] = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]);
            out1.val[1] = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]);
        }

        in.increment(Window::DimX);

        // Convert to F32, scale by 1/scale and convert back to S32
        float32x4_t out0_f32_odd  = vcvtq_f32_s32(out0.val[0]);
        float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]);
        out0_f32_odd  = vmulq_f32(out0_f32_odd, oneoverscale);
        out0_f32_even = vmulq_f32(out0_f32_even, oneoverscale);
        out0.val[0] = vcvtq_s32_f32(out0_f32_odd);
        out0.val[1] = vcvtq_s32_f32(out0_f32_even);

        float32x4_t out1_f32_odd  = vcvtq_f32_s32(out1.val[0]);
        float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]);
        out1_f32_odd  = vmulq_f32(out1_f32_odd, oneoverscale);
        out1_f32_even = vmulq_f32(out1_f32_even, oneoverscale);
        out1.val[0] = vcvtq_s32_f32(out1_f32_odd);
        out1.val[1] = vcvtq_s32_f32(out1_f32_even);

        // Re-interleave the lanes with vzipq_s32 before storing
        const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]);
        store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()));

        const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]);
        store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast<OutputType *>(out.ptr()) + 8);
    },
    in, out);
}
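// NEConvolutionRectangleKernel handles non-square matrices. Rather than
// templating the whole class on the size, configure() records the width and
// height and run() dispatches through a 4x4 table of member-function
// pointers, one per (rows, cols) combination in {3, 5, 7, 9}.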
NEConvolutionRectangleKernel::NEConvolutionRectangleKernel()
    : _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0)
{
}

BorderSize NEConvolutionRectangleKernel::border_size() const
{
    return _border_size;
}

void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

    _input       = input;
    _output      = output;
    _scale       = scale;
    _border_size = BorderSize(height / 2, width / 2);

    // Copy the coefficients
    const uint32_t nr_elements = width * height;
    _convolution.resize(nr_elements);
    std::copy_n(conv, nr_elements, _convolution.begin());

    // Set function index to help choose the appropriate function in run()
    _func_idx = get_index(height) * 4 + get_index(width);

    // Configure kernel window
    constexpr unsigned int num_elems_read_per_iteration    = 16;
    constexpr unsigned int num_elems_written_per_iteration = 8;

    Window                 win = calculate_max_window(input->info()->valid_region(), Steps(num_elems_written_per_iteration), border_undefined, _border_size);
    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height),
                              output_access);

    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, _border_size);

    INEKernel::configure(win);
}
void NEConvolutionRectangleKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    // Function table for U8 output: indexed by get_index(height) * 4 + get_index(width)
    static const std::array<ConvolutionRectangleFunction, 16> func_table_u8 =
    {
        {
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 3, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 5, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 7, 9>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 3>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 5>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 7>,
            &NEConvolutionRectangleKernel::convolution<uint8_t, 9, 9>
        }
    };

    // Function table for S16 output
    static const std::array<ConvolutionRectangleFunction, 16> func_table_s16 =
    {
        {
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 3, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 5, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 7, 9>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 3>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 5>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 7>,
            &NEConvolutionRectangleKernel::convolution<int16_t, 9, 9>
        }
    };

    switch(_output->info()->data_type())
    {
        case DataType::U8:
            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size());
            (this->*func_table_u8[_func_idx])(window);
            break;
        case DataType::S16:
            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size());
            (this->*func_table_s16[_func_idx])(window);
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported data type");
            break;
    }
}
unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val)
{
    switch(val)
    {
        case 3:
            return 0;
        case 5:
            return 1;
        case 7:
            return 2;
        case 9:
            return 3;
        default:
            ARM_COMPUTE_ERROR("Unsupported dimension size");
            return 0;
    }
}
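// get_index maps a dimension of 3/5/7/9 to 0/1/2/3, so for example a matrix
// with height 5 and width 7 yields _func_idx = get_index(5) * 4 + get_index(7)
// = 1 * 4 + 2 = 6, selecting convolution<OutputType, 5, 7> from the tables.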
template <typename OutputType, unsigned int rows, unsigned int cols>
void NEConvolutionRectangleKernel::convolution(const Window &win)
{
    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    Iterator input(_input, win);
    Iterator output(_output, win);

    std::array<unsigned char *, rows> input_ptrs{ {} };
    const int16_t    *conv       = _convolution.data();
    const float32x4_t scale_val  = vdupq_n_f32(1.0f / _scale);
    const int         k_row_half = rows / 2;
    const int         k_col_half = cols / 2;

    // Set row pointers
    for(int i = -k_row_half; i <= k_row_half; ++i)
    {
        input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
    }

    execute_window_loop(win, [&](const Coordinates &)
    {
        int32x4_t out  = vdupq_n_s32(0);
        int32x4_t out2 = vdupq_n_s32(0);

        // Perform the appropriate row convolution for each of the matrix rows
        for(unsigned int r = 0; r < rows; ++r)
        {
            const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
            if(3 == cols)
            {
                convolve_row3x1(out, out2, data, conv + r * cols);
            }
            else if(5 == cols)
            {
                convolve_row5x1(out, out2, data, conv + r * cols);
            }
            else if(7 == cols)
            {
                convolve_row7x1(out, out2, data, conv + r * cols);
            }
            else if(9 == cols)
            {
                convolve_row9x1(out, out2, data, conv + r * cols);
            }
            else
            {
                ARM_COMPUTE_ERROR("Unsupported number of columns");
            }
        }

        // Apply scale: convert to F32, multiply by 1/scale, convert back to S32
        out  = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val));
        out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val));

        // Clamp and store as U8 or S16
        store_results(out, out2, reinterpret_cast<OutputType *>(output.ptr()));
    },
    input, output);
}
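// A minimal usage sketch (hypothetical values, not from this file): assuming
// `src` is a configured U8 ITensor and `dst` a U8 tensor of the same shape,
// a 5x3 rectangle convolution would be configured per the contract above:
//
//   const int16_t coeffs[15] = { /* 5 columns x 3 rows */ };
//   NEConvolutionRectangleKernel kernel;
//   kernel.configure(&src, &dst, coeffs, /* width */ 5, /* height */ 3,
//                    /* scale */ 1, /* border_undefined */ false);
//
// run() then dispatches to convolution<uint8_t, 3, 5> via the function table.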