Execute the kernel on the passed window.
137 const auto window_start_x =
static_cast<int>(
window.
x().
start());
138 const auto window_end_x =
static_cast<int>(
window.
x().
end());
139 const int window_step_x = 16;
151 Iterator
src(_src, win);
152 Iterator
dst(_dst, win);
154 switch(_src->info()->data_type())
158 switch(_dst->info()->data_type())
165 const auto src_ptr =
reinterpret_cast<const int8_t *
>(
src.ptr());
166 const auto dst_ptr =
reinterpret_cast<int16_t *
>(
dst.ptr());
167 int x = window_start_x;
169 for(; x <= (window_end_x - window_step_x); x += window_step_x)
171 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
173 const int16x8x2_t texels =
176 vmovl_s8(vget_low_s8(texels_s8)),
177 vmovl_s8(vget_high_s8(texels_s8))
181 vst1q_s16(dst_ptr + x, texels.val[0]);
182 vst1q_s16(dst_ptr + x + 8, texels.val[1]);
186 for(; x < window_end_x; ++x)
188 *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
199 const auto src_ptr =
reinterpret_cast<const int8_t *
>(
src.ptr());
200 const auto dst_ptr =
reinterpret_cast<int32_t *
>(
dst.ptr());
201 int x = window_start_x;
203 for(; x <= (window_end_x - window_step_x); x += window_step_x)
205 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
207 const int16x8x2_t texels =
210 vmovl_s8(vget_low_s8(texels_s8)),
211 vmovl_s8(vget_high_s8(texels_s8))
215 vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
216 vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
217 vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
218 vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
222 for(; x < window_end_x; ++x)
224 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
// S8 -> F32: widen each signed byte s8 -> s16 -> s32, then convert to float.
const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    // BUG FIX: load from src_ptr + x. The original called
    // vld1q_s8(reinterpret_cast<int8_t *>(src.ptr())) with no x offset,
    // re-reading the first 16 elements of the row on every iteration.
    const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vmovl_s8(vget_low_s8(texels_s8)),
            vmovl_s8(vget_high_s8(texels_s8))
        }
    };

    vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
    vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
    vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
    vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
}

// Scalar tail for the elements left over after the 16-wide vector loop.
for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
}
265 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 271 const auto src_ptr =
reinterpret_cast<const int8_t *
>(
src.ptr());
272 const auto dst_ptr =
reinterpret_cast<float16_t *
>(
dst.ptr());
273 int x = window_start_x;
275 for(; x <= (window_end_x - window_step_x); x += window_step_x)
277 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
279 const int16x8x2_t texels =
282 vmovl_s8(vget_low_s8(texels_s8)),
283 vmovl_s8(vget_high_s8(texels_s8))
291 for(; x < window_end_x; ++x)
293 *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
299 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 310 switch(_dst->info()->data_type())
// U8 -> S16: widen each unsigned byte to u16 and reinterpret as s16
// (every u8 value fits in s16, so the reinterpret is lossless).
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
        }
    };

    vst1q_s16(dst_ptr + x, texels.val[0]);
    vst1q_s16(dst_ptr + x + 8, texels.val[1]);
}

// Scalar tail. FIX: cast to int16_t — the original cast to int32_t even
// though the destination element type is int16_t (value-identical for u8
// sources, but the cast type should match the store type).
for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
}
// U8 -> S32: widen u8 -> u16 (reinterpreted s16) -> s32.
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
        }
    };

    vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
    vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
    vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
    vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
}

// Scalar tail. FIX: cast to int32_t — the original cast to uint32_t while
// storing through an int32_t pointer (value-identical for u8 sources, but
// signed/unsigned mismatch against the destination type).
for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
}
// U8 -> F32: widen u8 -> s16 -> s32, then convert to float.
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
        }
    };

    vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
    vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
    vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
    vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
}

// Scalar tail. FIX: cast to float — the original cast to uint32_t and then
// relied on the implicit uint32->float conversion at the store; the cast
// should name the destination element type directly.
for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
}
417 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 423 const auto src_ptr =
reinterpret_cast<const uint8_t *
>(
src.ptr());
424 const auto dst_ptr =
reinterpret_cast<float16_t *
>(
dst.ptr());
426 int x = window_start_x;
427 for(; x <= (window_end_x - window_step_x); x += window_step_x)
429 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
431 const int16x8x2_t texels =
434 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
435 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
443 for(; x < window_end_x; ++x)
445 *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
451 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 457 const auto src_ptr =
reinterpret_cast<const uint8_t *
>(
src.ptr());
458 const auto dst_ptr =
reinterpret_cast<uint16_t *
>(
dst.ptr());
460 int x = window_start_x;
461 for(; x <= (window_end_x - window_step_x); x += window_step_x)
463 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
465 const uint16x8x2_t texels =
468 vmovl_u8(vget_low_u8(texels_u8)),
469 vmovl_u8(vget_high_u8(texels_u8))
473 vst1q_u16(dst_ptr + x, texels.val[0]);
474 vst1q_u16(dst_ptr + x + 8, texels.val[1]);
478 for(; x < window_end_x; ++x)
480 *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
493 switch(_dst->info()->data_type())
502 const auto src_ptr =
reinterpret_cast<const int16_t *
>(
src.ptr());
503 const auto dst_ptr =
reinterpret_cast<int8_t *
>(
dst.ptr());
505 int x = window_start_x;
506 for(; x <= (window_end_x - window_step_x); x += window_step_x)
508 const int16x8x2_t texels =
511 vld1q_s16(src_ptr + x),
512 vld1q_s16(src_ptr + x + 8)
516 vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
520 for(; x < window_end_x; ++x)
522 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
531 const auto src_ptr =
reinterpret_cast<const int16_t *
>(
src.ptr());
532 const auto dst_ptr =
reinterpret_cast<int8_t *
>(
dst.ptr());
534 int x = window_start_x;
535 for(; x <= (window_end_x - window_step_x); x += window_step_x)
537 const int16x8x2_t texels =
540 vld1q_s16(src_ptr + x),
541 vld1q_s16(src_ptr + x + 8)
545 vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
549 for(; x < window_end_x; ++x)
551 *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
565 const auto src_ptr =
reinterpret_cast<const int16_t *
>(
src.ptr());
566 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
568 int x = window_start_x;
569 for(; x <= (window_end_x - window_step_x); x += window_step_x)
571 const int16x8x2_t texels =
574 vld1q_s16(src_ptr + x),
575 vld1q_s16(src_ptr + x + 8)
579 vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
583 for(; x < window_end_x; ++x)
585 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
594 const auto src_ptr =
reinterpret_cast<const int16_t *
>(
src.ptr());
595 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
597 int x = window_start_x;
598 for(; x <= (window_end_x - window_step_x); x += window_step_x)
600 const int16x8x2_t texels =
603 vld1q_s16(src_ptr + x),
604 vld1q_s16(src_ptr + x + 8)
608 vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
609 vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
613 for(; x < window_end_x; ++x)
615 *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
627 const auto src_ptr =
reinterpret_cast<const int16_t *
>(
src.ptr());
628 const auto dst_ptr =
reinterpret_cast<int32_t *
>(
dst.ptr());
630 int x = window_start_x;
631 for(; x <= (window_end_x - window_step_x); x += window_step_x)
633 const int16x8x2_t texels =
636 vld1q_s16(src_ptr + x),
637 vld1q_s16(src_ptr + x + 8)
641 const int32x4x4_t texels_s32 =
644 vmovl_s16(vget_low_s16(texels.val[0])),
645 vmovl_s16(vget_high_s16(texels.val[0])),
646 vmovl_s16(vget_low_s16(texels.val[1])),
647 vmovl_s16(vget_high_s16(texels.val[1]))
651 vst1q_s32(dst_ptr + x, texels_s32.val[0]);
652 vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
653 vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
654 vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
658 for(; x < window_end_x; ++x)
660 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
673 switch(_dst->info()->data_type())
682 const auto src_ptr =
reinterpret_cast<const uint16_t *
>(
src.ptr());
683 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
685 int x = window_start_x;
686 for(; x <= (window_end_x - window_step_x); x += window_step_x)
688 const uint16x8x2_t texels =
691 vld1q_u16(src_ptr + x),
692 vld1q_u16(src_ptr + x + 8)
696 vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
700 for(; x < window_end_x; ++x)
702 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
711 const auto src_ptr =
reinterpret_cast<const uint16_t *
>(
src.ptr());
712 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
714 int x = window_start_x;
715 for(; x <= (window_end_x - window_step_x); x += window_step_x)
717 const uint16x8x2_t texels =
720 vld1q_u16(src_ptr + x),
721 vld1q_u16(src_ptr + x + 8)
725 vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
729 for(; x < window_end_x; ++x)
731 *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
744 const auto src_ptr =
reinterpret_cast<const uint16_t *
>(
src.ptr());
745 const auto dst_ptr =
reinterpret_cast<uint32_t *
>(
dst.ptr());
747 int x = window_start_x;
748 for(; x <= (window_end_x - window_step_x); x += window_step_x)
750 const uint16x8x2_t texels =
753 vld1q_u16(src_ptr + x),
754 vld1q_u16(src_ptr + x + 8)
758 vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
759 vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
760 vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
761 vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
764 for(; x < window_end_x; ++x)
766 *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
778 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) 780 switch(_dst->info()->data_type())
// BFLOAT16 -> F32: a bfloat16 is the top 16 bits of an IEEE-754 float32,
// so the conversion is: widen u16 -> u32, shift left 16, reinterpret as f32.
const auto src_ptr = reinterpret_cast<const bfloat16 *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    // BUG FIX: index the loads and stores with x. The original used raw
    // src.ptr()/dst.ptr() with no offset, so the vector loop converted the
    // same first 16 elements of the row on every iteration.
    const uint16x8x2_t texels =
    {
        {
            vld1q_u16(reinterpret_cast<const uint16_t *>(src_ptr) + x),
            vld1q_u16(reinterpret_cast<const uint16_t *>(src_ptr) + x + 8)
        }
    };

    vst1q_f32(dst_ptr + x, vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
    vst1q_f32(dst_ptr + x + 4, vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
    vst1q_f32(dst_ptr + x + 8, vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
    vst1q_f32(dst_ptr + x + 12, vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
}

// Scalar tail.
for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = float(*(src_ptr + x));
}
824 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 826 switch(_dst->info()->data_type())
833 const auto src_ptr =
reinterpret_cast<const float16_t *
>(
src.ptr());
834 const auto dst_ptr =
reinterpret_cast<int8_t *
>(
dst.ptr());
836 int x = window_start_x;
837 for(; x <= (window_end_x - window_step_x); x += window_step_x)
839 const float16x8x2_t texels =
842 vld1q_f16(src_ptr + x),
843 vld1q_f16(src_ptr + x + 8),
851 for(; x < window_end_x; ++x)
853 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
865 const auto src_ptr =
reinterpret_cast<const float16_t *
>(
src.ptr());
866 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
868 int x = window_start_x;
869 for(; x <= (window_end_x - window_step_x); x += window_step_x)
871 const float16x8x2_t texels =
874 vld1q_f16(src_ptr + x),
875 vld1q_f16(src_ptr + x + 8),
883 for(; x < window_end_x; ++x)
885 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
897 const auto src_ptr =
reinterpret_cast<const float16_t *
>(
src.ptr());
898 const auto dst_ptr =
reinterpret_cast<float *
>(
dst.ptr());
900 int x = window_start_x;
901 for(; x <= (window_end_x - window_step_x); x += window_step_x)
903 const float16x8x2_t texels =
906 vld1q_f16(src_ptr + x),
907 vld1q_f16(src_ptr + x + 8)
910 vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
911 vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
912 vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
913 vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
917 for(; x < window_end_x; ++x)
919 *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
930 const auto src_ptr =
reinterpret_cast<const float16_t *
>(
src.ptr());
931 const auto dst_ptr =
reinterpret_cast<int32_t *
>(
dst.ptr());
933 int x = window_start_x;
934 for(; x <= (window_end_x - window_step_x); x += window_step_x)
936 const float16x8x2_t texels =
939 vld1q_f16(src_ptr + x),
940 vld1q_f16(src_ptr + x + 8)
944 vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
945 vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
946 vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
947 vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
951 for(; x < window_end_x; ++x)
953 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
965 switch(_dst->info()->data_type())
967 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 973 const auto src_ptr =
reinterpret_cast<const float *
>(
src.ptr());
974 const auto dst_ptr =
reinterpret_cast<float16_t *
>(
dst.ptr());
976 int x = window_start_x;
977 for(; x <= (window_end_x - window_step_x); x += window_step_x)
979 const float32x4x4_t texels =
982 vld1q_f32(src_ptr + x),
983 vld1q_f32(src_ptr + x + 4),
984 vld1q_f32(src_ptr + x + 8),
985 vld1q_f32(src_ptr + x + 12)
989 vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
990 vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
994 for(; x < window_end_x; ++x)
996 *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
// F32 -> BFLOAT16: convert two 8-element groups per iteration via the
// wrapper helper, then handle the remainder scalar-by-scalar.
const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
const auto dst_ptr = reinterpret_cast<bfloat16 *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    // BUG FIX: offset the conversion by x. The original passed raw
    // src.ptr()/dst.ptr() (and +8), so every vector iteration converted
    // only the first 16 elements of the row.
    wrapper::vcvt_bf16_f32(src_ptr + x,
                           reinterpret_cast<uint16_t *>(dst_ptr) + x);
    wrapper::vcvt_bf16_f32(src_ptr + x + 8,
                           reinterpret_cast<uint16_t *>(dst_ptr) + x + 8);
}

// Scalar tail: bfloat16's converting assignment from float does the rounding.
for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = *(src_ptr + x);
}
1035 const auto src_ptr =
reinterpret_cast<const float *
>(
src.ptr());
1036 const auto dst_ptr =
reinterpret_cast<int32_t *
>(
dst.ptr());
1038 int x = window_start_x;
1039 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1041 const float32x4x4_t texels =
1044 vld1q_f32(src_ptr + x),
1045 vld1q_f32(src_ptr + x + 4),
1046 vld1q_f32(src_ptr + x + 8),
1047 vld1q_f32(src_ptr + x + 12),
1051 vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
1052 vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
1053 vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
1054 vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
1058 for(; x < window_end_x; ++x)
1060 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
1072 const auto src_ptr =
reinterpret_cast<const float *
>(
src.ptr());
1073 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
1075 int x = window_start_x;
1076 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1078 const float32x4x4_t texels =
1081 vld1q_f32(src_ptr + x),
1082 vld1q_f32(src_ptr + x + 4),
1083 vld1q_f32(src_ptr + x + 8),
1084 vld1q_f32(src_ptr + x + 12),
1088 vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
1089 vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
1093 for(; x < window_end_x; ++x)
1095 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
1106 const auto src_ptr =
reinterpret_cast<const float *
>(
src.ptr());
1107 const auto dst_ptr =
reinterpret_cast<int8_t *
>(
dst.ptr());
1109 int x = window_start_x;
1110 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1112 const float32x4x4_t texels =
1115 vld1q_f32(src_ptr + x),
1116 vld1q_f32(src_ptr + x + 4),
1117 vld1q_f32(src_ptr + x + 8),
1118 vld1q_f32(src_ptr + x + 12),
1122 vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
1123 vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
1126 for(; x < window_end_x; ++x)
1128 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
1141 switch(_dst->info()->data_type())
1143 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1149 const auto src_ptr =
reinterpret_cast<const int32_t *
>(
src.ptr());
1150 const auto dst_ptr =
reinterpret_cast<float16_t *
>(
dst.ptr());
1152 int x = window_start_x;
1153 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1155 const float32x4x4_t texels =
1158 vcvtq_f32_s32(vld1q_s32(src_ptr + x)),
1159 vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
1160 vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)),
1161 vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))
1165 vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
1166 vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
1170 for(; x < window_end_x; ++x)
1172 *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
1184 const auto src_ptr =
reinterpret_cast<const int32_t *
>(
src.ptr());
1185 const auto dst_ptr =
reinterpret_cast<float *
>(
dst.ptr());
1187 int x = window_start_x;
1188 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1190 const int32x4x4_t texels =
1193 vld1q_s32(src_ptr + x),
1194 vld1q_s32(src_ptr + x + 4),
1195 vld1q_s32(src_ptr + x + 8),
1196 vld1q_s32(src_ptr + x + 12),
1200 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
1201 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
1202 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
1203 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
1207 for(; x < window_end_x; ++x)
1209 *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
1222 const auto src_ptr =
reinterpret_cast<const int32_t *
>(
src.ptr());
1223 const auto dst_ptr =
reinterpret_cast<int8_t *
>(
dst.ptr());
1225 int x = window_start_x;
1226 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1228 const int32x4x4_t texels =
1231 vld1q_s32(src_ptr + x),
1232 vld1q_s32(src_ptr + x + 4),
1233 vld1q_s32(src_ptr + x + 8),
1234 vld1q_s32(src_ptr + x + 12),
1237 vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
1238 vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
1242 for(; x < window_end_x; ++x)
1244 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
1253 const auto src_ptr =
reinterpret_cast<const int32_t *
>(
src.ptr());
1254 const auto dst_ptr =
reinterpret_cast<int8_t *
>(
dst.ptr());
1256 int x = window_start_x;
1257 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1259 const int32x4x4_t texels =
1262 vld1q_s32(src_ptr + x),
1263 vld1q_s32(src_ptr + x + 4),
1264 vld1q_s32(src_ptr + x + 8),
1265 vld1q_s32(src_ptr + x + 12)
1269 vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
1270 vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
1274 for(; x < window_end_x; ++x)
1276 *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
1291 const auto src_ptr =
reinterpret_cast<const int32_t *
>(
src.ptr());
1292 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
1294 int x = window_start_x;
1295 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1297 const int32x4x4_t texels =
1300 vld1q_s32(src_ptr + x),
1301 vld1q_s32(src_ptr + x + 4),
1302 vld1q_s32(src_ptr + x + 8),
1303 vld1q_s32(src_ptr + x + 12)
1306 vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
1307 vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
1311 for(; x < window_end_x; ++x)
1313 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
1322 const auto src_ptr =
reinterpret_cast<const int32_t *
>(
src.ptr());
1323 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
1325 int x = window_start_x;
1326 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1328 const int32x4x4_t texels =
1331 vld1q_s32(src_ptr + x),
1332 vld1q_s32(src_ptr + x + 4),
1333 vld1q_s32(src_ptr + x + 8),
1334 vld1q_s32(src_ptr + x + 12)
1338 vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
1339 vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
1343 for(; x < window_end_x; ++x)
1345 *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
float16x8_t vcvtq_f16_s16(int16x8_t)
const Window & window() const
The maximum window the kernel can be executed on.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
1 channel, 1 U8 per channel
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
1 channel, 1 U16 per channel
SimpleTensor< float > src
1 channel, 1 F16 per channel
arm_compute::bfloat16 bfloat16
1 channel, 1 S32 per channel
16-bit brain floating-point number
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
1 channel, 1 U32 per channel
quantized, asymmetric fixed-point 8-bit number unsigned
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
1 channel, 1 S16 per channel
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
int16x8_t vcvtq_s16_f16(float16x8_t)
quantized, asymmetric fixed-point 8-bit number signed
constexpr int end() const
Return the end of the dimension.
constexpr int start() const
Return the start of the dimension.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
constexpr const Dimension & x() const
Alias to access the first dimension of the window.