Execute the kernel on the passed window.
// Compute the horizontal (x-axis) bounds of the execution window.
// NOTE(review): the stray numeric tokens ("277", "278", ...) appear to be
// original-file line numbers fused into the text by an extraction tool;
// they are not valid C++ and must be stripped when the file is restored.
277 const auto window_start_x =
static_cast<int>(
window.
x().
start());
278 const auto window_end_x =
static_cast<int>(
window.
x().
end());
// 16 source elements per vectorised iteration — one NEON q-register of
// 8-bit lanes; narrower steps for wider types are derived from this below.
279 const int window_step_x = 16;
// Tensor iterators stepped by the (collapsed) window `win`.
// NOTE(review): `win` is declared above this excerpt — presumably `window`
// with the x dimension collapsed; confirm against the full file.
291 Iterator
src(_src, win);
292 Iterator
dst(_dst, win);
// Outer dispatch on the source element type. For 64-bit integer sources
// (AArch64 builds only, per the #endif below) the templated convert64
// helper performs the whole windowed conversion.
// NOTE(review): the case labels and braces for this switch are missing
// from this excerpt.
298 switch (_src->info()->data_type())
303 switch (_dst->info()->data_type())
// U64 -> F32
307 convert64<uint64_t, float>(
src,
dst, win, window_start_x, window_end_x, window_step_x);
317 switch (_dst->info()->data_type())
// S64 -> F32
321 convert64<int64_t, float>(
src,
dst, win, window_start_x, window_end_x, window_step_x);
329 #endif // __aarch64__
// Source is S8 / QASYMM8_SIGNED: widen to the requested destination type.
// Each lambda below is the body passed to execute_window_loop (the wrapper
// calls are missing from this excerpt).
333 switch (_dst->info()->data_type())
// S8 -> S16: sign-extend 16 lanes via vmovl_s8 on the low/high halves.
340 [&](
const Coordinates &)
342 const auto src_ptr =
reinterpret_cast<const int8_t *
>(
src.ptr());
343 const auto dst_ptr =
reinterpret_cast<int16_t *
>(
dst.ptr());
344 int x = window_start_x;
346 for (; x <= (window_end_x - window_step_x); x += window_step_x)
348 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
350 const int16x8x2_t texels = {
351 {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
353 vst1q_s16(dst_ptr + x, texels.val[0]);
354 vst1q_s16(dst_ptr + x + 8, texels.val[1]);
// Scalar tail for the remaining (< 16) elements.
358 for (; x < window_end_x; ++x)
360 *(dst_ptr + x) =
static_cast<int16_t
>(*(src_ptr + x));
// S8 -> S32: sign-extend in two steps (s8 -> s16 -> s32).
371 [&](
const Coordinates &)
373 const auto src_ptr =
reinterpret_cast<const int8_t *
>(
src.ptr());
374 const auto dst_ptr =
reinterpret_cast<int32_t *
>(
dst.ptr());
375 int x = window_start_x;
377 for (; x <= (window_end_x - window_step_x); x += window_step_x)
379 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
381 const int16x8x2_t texels = {
382 {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
384 vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
385 vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
386 vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
387 vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
391 for (; x < window_end_x; ++x)
393 *(dst_ptr + x) =
static_cast<int32_t
>(*(src_ptr + x));
// S8 -> F32: widen to s32 then convert with vcvtq_f32_s32 (exact for s8).
404 [&](
const Coordinates &)
406 const auto src_ptr =
reinterpret_cast<const int8_t *
>(
src.ptr());
407 const auto dst_ptr =
reinterpret_cast<float *
>(
dst.ptr());
409 int x = window_start_x;
410 for (; x <= (window_end_x - window_step_x); x += window_step_x)
412 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
414 const int16x8x2_t texels = {
415 {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
416 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
417 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
418 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
419 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
423 for (; x < window_end_x; ++x)
425 *(dst_ptr + x) =
static_cast<float>(*(src_ptr + x));
// Remaining S8 destinations: delegate to a selected micro-kernel.
435 uk->ukernel(_src, _dst,
info, _policy,
window);
// Source is U8 / QASYMM8: zero-extend to the requested destination type.
447 switch (_dst->info()->data_type())
// U8 -> S16: zero-extend with vmovl_u8, then reinterpret as signed
// (safe — u8 values always fit in int16_t).
454 [&](
const Coordinates &)
456 const auto src_ptr =
reinterpret_cast<const uint8_t *
>(
src.ptr());
457 const auto dst_ptr =
reinterpret_cast<int16_t *
>(
dst.ptr());
459 int x = window_start_x;
460 for (; x <= (window_end_x - window_step_x); x += window_step_x)
462 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
464 const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
465 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
467 vst1q_s16(dst_ptr + x, texels.val[0]);
468 vst1q_s16(dst_ptr + x + 8, texels.val[1]);
472 for (; x < window_end_x; ++x)
// NOTE(review): casts to int32_t although dst is int16_t — value fits
// (u8 range), but sibling cases cast to the destination type; likely a
// copy/paste slip worth normalising.
474 *(dst_ptr + x) =
static_cast<int32_t
>(*(src_ptr + x));
// U8 -> S32: zero-extend u8 -> u16 -> (reinterpret s16) -> s32.
485 [&](
const Coordinates &)
487 const auto src_ptr =
reinterpret_cast<const uint8_t *
>(
src.ptr());
488 const auto dst_ptr =
reinterpret_cast<int32_t *
>(
dst.ptr());
490 int x = window_start_x;
491 for (; x <= (window_end_x - window_step_x); x += window_step_x)
493 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
495 const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
496 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
498 vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
499 vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
500 vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
501 vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
505 for (; x < window_end_x; ++x)
// NOTE(review): casts to uint32_t although dst is int32_t — harmless for
// u8 values but inconsistent; should be static_cast<int32_t>.
507 *(dst_ptr + x) =
static_cast<uint32_t
>(*(src_ptr + x));
// U8 -> F32: widen to s32 then vcvtq_f32_s32 (exact for u8 values).
518 [&](
const Coordinates &)
520 const auto src_ptr =
reinterpret_cast<const uint8_t *
>(
src.ptr());
521 const auto dst_ptr =
reinterpret_cast<float *
>(
dst.ptr());
523 int x = window_start_x;
524 for (; x <= (window_end_x - window_step_x); x += window_step_x)
526 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
528 const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
529 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
530 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
531 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
532 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
533 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
537 for (; x < window_end_x; ++x)
// NOTE(review): casts to uint32_t although dst is float — the u8 value is
// converted u8 -> uint32_t -> float, which is exact, but sibling cases use
// static_cast<float>; worth normalising.
539 *(dst_ptr + x) =
static_cast<uint32_t
>(*(src_ptr + x));
// Remaining U8 destinations: delegate to a selected micro-kernel.
549 uk->ukernel(_src, _dst,
info, _policy,
window);
// U8 -> U16: plain zero-extension via vmovl_u8.
557 [&](
const Coordinates &)
559 const auto src_ptr =
reinterpret_cast<const uint8_t *
>(
src.ptr());
560 const auto dst_ptr =
reinterpret_cast<uint16_t *
>(
dst.ptr());
562 int x = window_start_x;
563 for (; x <= (window_end_x - window_step_x); x += window_step_x)
565 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
567 const uint16x8x2_t texels = {
568 {vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}};
570 vst1q_u16(dst_ptr + x, texels.val[0]);
571 vst1q_u16(dst_ptr + x + 8, texels.val[1]);
575 for (; x < window_end_x; ++x)
577 *(dst_ptr + x) =
static_cast<uint16_t
>(*(src_ptr + x));
// Source is S16: narrow (with or without saturation, presumably selected
// by _policy in case labels missing from this excerpt) or widen to S32.
590 switch (_dst->info()->data_type())
// S16 -> S8, saturating: vqmovn_s16 clamps to [-128, 127]; scalar tail
// uses saturate_cast to match.
599 [&](
const Coordinates &)
601 const auto src_ptr =
reinterpret_cast<const int16_t *
>(
src.ptr());
602 const auto dst_ptr =
reinterpret_cast<int8_t *
>(
dst.ptr());
604 int x = window_start_x;
605 for (; x <= (window_end_x - window_step_x); x += window_step_x)
607 const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
609 vst1q_s8(dst_ptr + x,
610 vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
614 for (; x < window_end_x; ++x)
616 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
// S16 -> S8, truncating: vmovn_s16 keeps the low 8 bits (wrap-around),
// mirrored by static_cast in the scalar tail.
625 [&](
const Coordinates &)
627 const auto src_ptr =
reinterpret_cast<const int16_t *
>(
src.ptr());
628 const auto dst_ptr =
reinterpret_cast<int8_t *
>(
dst.ptr());
630 int x = window_start_x;
631 for (; x <= (window_end_x - window_step_x); x += window_step_x)
633 const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
635 vst1q_s8(dst_ptr + x,
636 vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
640 for (; x < window_end_x; ++x)
642 *(dst_ptr + x) =
static_cast<int8_t
>(*(src_ptr + x));
// S16 -> U8, saturating: vqmovun_s16 clamps to [0, 255] (negatives -> 0).
656 [&](
const Coordinates &)
658 const auto src_ptr =
reinterpret_cast<const int16_t *
>(
src.ptr());
659 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
661 int x = window_start_x;
662 for (; x <= (window_end_x - window_step_x); x += window_step_x)
664 const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
666 vst1q_u8(dst_ptr + x,
667 vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
671 for (; x < window_end_x; ++x)
673 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
// S16 -> U8, truncating: reinterpret as u16 then vmovn_u16 keeps the low
// 8 bits, matching static_cast<uint8_t> modulo-256 behaviour.
682 [&](
const Coordinates &)
684 const auto src_ptr =
reinterpret_cast<const int16_t *
>(
src.ptr());
685 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
687 int x = window_start_x;
688 for (; x <= (window_end_x - window_step_x); x += window_step_x)
690 const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
692 vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
693 vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
697 for (; x < window_end_x; ++x)
699 *(dst_ptr + x) =
static_cast<uint8_t
>(*(src_ptr + x));
// S16 -> S32: lossless sign-extension via vmovl_s16 on each half.
711 [&](
const Coordinates &)
713 const auto src_ptr =
reinterpret_cast<const int16_t *
>(
src.ptr());
714 const auto dst_ptr =
reinterpret_cast<int32_t *
>(
dst.ptr());
716 int x = window_start_x;
717 for (; x <= (window_end_x - window_step_x); x += window_step_x)
719 const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
721 const int32x4x4_t texels_s32 = {
722 {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])),
723 vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}};
725 vst1q_s32(dst_ptr + x, texels_s32.val[0]);
726 vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
727 vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
728 vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
732 for (; x < window_end_x; ++x)
734 *(dst_ptr + x) =
static_cast<int32_t
>(*(src_ptr + x));
// Source is U16: narrow to U8 (saturating/truncating) or widen to U32.
748 switch (_dst->info()->data_type())
// U16 -> U8, saturating: vqmovn_u16 clamps to [0, 255].
757 [&](
const Coordinates &)
759 const auto src_ptr =
reinterpret_cast<const uint16_t *
>(
src.ptr());
760 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
762 int x = window_start_x;
763 for (; x <= (window_end_x - window_step_x); x += window_step_x)
765 const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
767 vst1q_u8(dst_ptr + x,
768 vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
772 for (; x < window_end_x; ++x)
774 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
// U16 -> U8, truncating: vmovn_u16 keeps the low 8 bits.
783 [&](
const Coordinates &)
785 const auto src_ptr =
reinterpret_cast<const uint16_t *
>(
src.ptr());
786 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
788 int x = window_start_x;
789 for (; x <= (window_end_x - window_step_x); x += window_step_x)
791 const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
793 vst1q_u8(dst_ptr + x,
794 vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
798 for (; x < window_end_x; ++x)
800 *(dst_ptr + x) =
static_cast<uint8_t
>(*(src_ptr + x));
// U16 -> U32: lossless zero-extension via vmovl_u16 on each half.
812 [&](
const Coordinates &)
814 const auto src_ptr =
reinterpret_cast<const uint16_t *
>(
src.ptr());
815 const auto dst_ptr =
reinterpret_cast<uint32_t *
>(
dst.ptr());
817 int x = window_start_x;
818 for (; x <= (window_end_x - window_step_x); x += window_step_x)
820 const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
822 vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
823 vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
824 vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
825 vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
828 for (; x < window_end_x; ++x)
830 *(dst_ptr + x) =
static_cast<uint32_t
>(*(src_ptr + x));
// Remaining U16 destinations: delegate to a selected micro-kernel.
845 uk->ukernel(_src, _dst,
info, _policy,
window);
// Source is (presumably) F16 here: all destinations go through a selected
// micro-kernel. TODO(review): confirm the enclosing case label — it is
// missing from this excerpt.
849 switch (_dst->info()->data_type())
855 uk->ukernel(_src, _dst,
info, _policy,
window);
// F32 -> S32: vcvtq_s32_f32 truncates toward zero; out-of-range inputs
// saturate on AArch64 hardware. The scalar tail's static_cast matches the
// truncation but NOT the saturation — out-of-range floats are UB there.
863 [&](
const Coordinates &)
865 const auto src_ptr =
reinterpret_cast<const float *
>(
src.ptr());
866 const auto dst_ptr =
reinterpret_cast<int32_t *
>(
dst.ptr());
868 int x = window_start_x;
869 for (; x <= (window_end_x - window_step_x); x += window_step_x)
871 const float32x4x4_t texels = {{
872 vld1q_f32(src_ptr + x),
873 vld1q_f32(src_ptr + x + 4),
874 vld1q_f32(src_ptr + x + 8),
875 vld1q_f32(src_ptr + x + 12),
878 vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
879 vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
880 vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
881 vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
885 for (; x < window_end_x; ++x)
887 *(dst_ptr + x) =
static_cast<int32_t
>(*(src_ptr + x));
// F32 -> U8, saturating: f32 -> s32 -> vqmovun (clamp >= 0) -> vqmovn.
899 [&](
const Coordinates &)
901 const auto src_ptr =
reinterpret_cast<const float *
>(
src.ptr());
902 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
904 int x = window_start_x;
905 for (; x <= (window_end_x - window_step_x); x += window_step_x)
907 const float32x4x4_t texels = {{
908 vld1q_f32(src_ptr + x),
909 vld1q_f32(src_ptr + x + 4),
910 vld1q_f32(src_ptr + x + 8),
911 vld1q_f32(src_ptr + x + 12),
915 vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])),
916 vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
917 vst1_u8(dst_ptr + x + 8,
918 vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])),
919 vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
923 for (; x < window_end_x; ++x)
925 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
// F32 -> S8, saturating: f32 -> s32 -> vqmovn_s32 -> vqmovn_s16.
936 [&](
const Coordinates &)
938 const auto src_ptr =
reinterpret_cast<const float *
>(
src.ptr());
939 const auto dst_ptr =
reinterpret_cast<int8_t *
>(
dst.ptr());
941 int x = window_start_x;
942 for (; x <= (window_end_x - window_step_x); x += window_step_x)
944 const float32x4x4_t texels = {{
945 vld1q_f32(src_ptr + x),
946 vld1q_f32(src_ptr + x + 4),
947 vld1q_f32(src_ptr + x + 8),
948 vld1q_f32(src_ptr + x + 12),
952 vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])),
953 vqmovn_s32(vcvtq_s32_f32(texels.val[1])))),
954 vst1_s8(dst_ptr + x + 8,
955 vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])),
956 vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
959 for (; x < window_end_x; ++x)
961 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
// Source is S32: widen to S64 (AArch64 only) / F32, or narrow to S8/U8.
973 switch (_dst->info()->data_type())
// S32 -> S64 via the templated helper (AArch64 builds only).
978 convert64<int32_t, int64_t>(
src,
dst, win, window_start_x, window_end_x, window_step_x);
981 #endif // __aarch64__
// Remaining destinations handled by a selected micro-kernel.
986 uk->ukernel(_src, _dst,
info, _policy,
window);
// S32 -> F32: vcvtq_f32_s32 (round-to-nearest; |v| > 2^24 loses precision).
994 [&](
const Coordinates &)
996 const auto src_ptr =
reinterpret_cast<const int32_t *
>(
src.ptr());
997 const auto dst_ptr =
reinterpret_cast<float *
>(
dst.ptr());
999 int x = window_start_x;
1000 for (; x <= (window_end_x - window_step_x); x += window_step_x)
1002 const int32x4x4_t texels = {{
1003 vld1q_s32(src_ptr + x),
1004 vld1q_s32(src_ptr + x + 4),
1005 vld1q_s32(src_ptr + x + 8),
1006 vld1q_s32(src_ptr + x + 12),
1009 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
1010 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
1011 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
1012 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
1016 for (; x < window_end_x; ++x)
1018 *(dst_ptr + x) =
static_cast<float>(*(src_ptr + x));
// S32 -> S8, saturating: two-stage vqmovn (s32 -> s16 -> s8).
1031 [&](
const Coordinates &)
1033 const auto src_ptr =
reinterpret_cast<const int32_t *
>(
src.ptr());
1034 const auto dst_ptr =
reinterpret_cast<int8_t *
>(
dst.ptr());
1036 int x = window_start_x;
1037 for (; x <= (window_end_x - window_step_x); x += window_step_x)
1039 const int32x4x4_t texels = {{
1040 vld1q_s32(src_ptr + x),
1041 vld1q_s32(src_ptr + x + 4),
1042 vld1q_s32(src_ptr + x + 8),
1043 vld1q_s32(src_ptr + x + 12),
1045 vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]),
1046 vqmovn_s32(texels.val[1]))));
1047 vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]),
1048 vqmovn_s32(texels.val[3]))));
1052 for (; x < window_end_x; ++x)
1054 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
// S32 -> S8, truncating: two-stage vmovn keeps the low 8 bits.
1063 [&](
const Coordinates &)
1065 const auto src_ptr =
reinterpret_cast<const int32_t *
>(
src.ptr());
1066 const auto dst_ptr =
reinterpret_cast<int8_t *
>(
dst.ptr());
1068 int x = window_start_x;
1069 for (; x <= (window_end_x - window_step_x); x += window_step_x)
1071 const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
1072 vld1q_s32(src_ptr + x + 8),
1073 vld1q_s32(src_ptr + x + 12)}};
1075 vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]),
1076 vmovn_s32(texels.val[1]))));
1077 vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]),
1078 vmovn_s32(texels.val[3]))));
1082 for (; x < window_end_x; ++x)
1084 *(dst_ptr + x) =
static_cast<int8_t
>(*(src_ptr + x));
// S32 -> U8, saturating: vqmovun_s32 clamps negatives to 0, then vqmovn.
1099 [&](
const Coordinates &)
1101 const auto src_ptr =
reinterpret_cast<const int32_t *
>(
src.ptr());
1102 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
1104 int x = window_start_x;
1105 for (; x <= (window_end_x - window_step_x); x += window_step_x)
1107 const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
1108 vld1q_s32(src_ptr + x + 8),
1109 vld1q_s32(src_ptr + x + 12)}};
1110 vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]),
1111 vqmovun_s32(texels.val[1]))));
1112 vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]),
1113 vqmovun_s32(texels.val[3]))));
1117 for (; x < window_end_x; ++x)
1119 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
// S32 -> U8, truncating: reinterpret as u32, then two-stage vmovn keeps
// the low 8 bits (modulo-256, matching static_cast<uint8_t>).
1128 [&](
const Coordinates &)
1130 const auto src_ptr =
reinterpret_cast<const int32_t *
>(
src.ptr());
1131 const auto dst_ptr =
reinterpret_cast<uint8_t *
>(
dst.ptr());
1133 int x = window_start_x;
1134 for (; x <= (window_end_x - window_step_x); x += window_step_x)
1136 const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
1137 vld1q_s32(src_ptr + x + 8),
1138 vld1q_s32(src_ptr + x + 12)}};
1140 vst1_u8(dst_ptr + x,
1141 vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),
1142 vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
1143 vst1_u8(dst_ptr + x + 8,
1144 vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),
1145 vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
1149 for (; x < window_end_x; ++x)
1151 *(dst_ptr + x) =
static_cast<uint8_t
>(*(src_ptr + x));