static const std::vector<CpuCastKernel::CastKernel> available_kernels = {
    // Each entry pairs a kernel name with a selector lambda over the
    // source/destination data types and the detected ISA features.
    {/* ... */, [](const CastDataTypeISASelectorData &data) { /* ... */ }, /* ... */},
    {/* ... */, [](const CastDataTypeISASelectorData &data) { /* ... */ }, /* ... */},
    {/* ... */, [](const CastDataTypeISASelectorData &data) { /* ... */ }, /* ... */},
    {/* ... */,
     [](const CastDataTypeISASelectorData &data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
     /* ... */},
    {"neon_fp32_to_fp16_cast", [](const CastDataTypeISASelectorData &data) { /* ... */ }, /* ... */},
    {/* ... */, [](const CastDataTypeISASelectorData &data) { /* ... */ }, /* ... */},
};
104 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
110 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
116 "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
120 "Only data_types supported [in] U16 -> [out] U8, U32");
125 "Only data_types supported [in] S16 -> [out] U8, S32");
131 "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
137 "Only data_types supported [in] F32 -> [out] QASYMM8, F16, S32, U8");
144 "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8, S64");
147 "Only data_types supported [in] S64 -> [out] F32");
150 "Only data_types supported [in] U64 -> [out] F32");
151 #endif // __aarch64__
// Checks run only when dst has already been initialised.
if (dst->total_size() > 0)
{
    /* ... */
}

// ...

ICPPKernel::configure(win);
template <typename T1, typename T2>
inline void internal_neon_convert(const T1 *src_ptr, T2 *dst_ptr)
{
    // Non-specialised type pairs have no NEON fast path; only the
    // specialisations below are dispatched via convert64.
    ARM_COMPUTE_UNUSED(src_ptr, dst_ptr);
}

#ifdef __aarch64__
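// AArch64-only specialisations: each call converts a tile of 16 elements.
// vmovl_s32 widens one half of an int32x4_t to int64x2_t, so a 16-lane tile
// is written back as eight 2-lane stores.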
template <>
inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int64_t *dst_ptr)
{
    const int32x4x4_t texels = {
        {vld1q_s32(src_ptr), vld1q_s32(src_ptr + 4), vld1q_s32(src_ptr + 8), vld1q_s32(src_ptr + 12)}};
    vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0])));
    vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0])));
    vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1])));
    vst1q_s64(dst_ptr + 6, vmovl_s32(vget_high_s32(texels.val[1])));
    vst1q_s64(dst_ptr + 8, vmovl_s32(vget_low_s32(texels.val[2])));
    vst1q_s64(dst_ptr + 10, vmovl_s32(vget_high_s32(texels.val[2])));
    vst1q_s64(dst_ptr + 12, vmovl_s32(vget_low_s32(texels.val[3])));
    vst1q_s64(dst_ptr + 14, vmovl_s32(vget_high_s32(texels.val[3])));
}
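// int64 -> float has no single-step NEON conversion, so the specialisation
// below goes via float64x2_t (vcvtq_f64_s64) and narrows with vcvt_f32_f64.
// Magnitudes above 2^24 can round, since float keeps a 24-bit significand.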
template <>
inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float *dst_ptr)
{
    const float64x2x4_t texels0 = {{vcvtq_f64_s64(vld1q_s64(src_ptr)), vcvtq_f64_s64(vld1q_s64(src_ptr + 2)),
                                    vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), vcvtq_f64_s64(vld1q_s64(src_ptr + 6))}};
    const float64x2x4_t texels1 = {{vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), vcvtq_f64_s64(vld1q_s64(src_ptr + 10)),
                                    vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), vcvtq_f64_s64(vld1q_s64(src_ptr + 14))}};

    const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
                                   vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
                                   vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
                                   vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}};

    vst1q_f32(dst_ptr, texels.val[0]);
    vst1q_f32(dst_ptr + 4, texels.val[1]);
    vst1q_f32(dst_ptr + 8, texels.val[2]);
    vst1q_f32(dst_ptr + 12, texels.val[3]);
}
// Unsigned variant of the same two-step conversion (vcvtq_f64_u64).
template <>
inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, float *dst_ptr)
{
    const float64x2x4_t texels0 = {{vcvtq_f64_u64(vld1q_u64(src_ptr)), vcvtq_f64_u64(vld1q_u64(src_ptr + 2)),
                                    vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), vcvtq_f64_u64(vld1q_u64(src_ptr + 6))}};
    const float64x2x4_t texels1 = {{vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), vcvtq_f64_u64(vld1q_u64(src_ptr + 10)),
                                    vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), vcvtq_f64_u64(vld1q_u64(src_ptr + 14))}};

    const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
                                   vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
                                   vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
                                   vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}};

    vst1q_f32(dst_ptr, texels.val[0]);
    vst1q_f32(dst_ptr + 4, texels.val[1]);
    vst1q_f32(dst_ptr + 8, texels.val[2]);
    vst1q_f32(dst_ptr + 12, texels.val[3]);
}
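// convert64 is the shared driver for the 64-bit paths: a vectorised main
// loop in steps of window_step_x delegating to internal_neon_convert, plus
// a scalar tail for the remaining elements.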
template <typename T1, typename T2>
inline void
convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x)
{
    execute_window_loop(
        win,
        [&](const Coordinates &)
        {
            const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr());
            const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr());

            int x = window_start_x;
            for (; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x);
            }
            for (; x < window_end_x; ++x)
            {
                *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x));
            }
        },
        src, dst);
}
#endif // __aarch64__
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x   = static_cast<int>(window.x().end());
const int  window_step_x  = 16;
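// Every vectorised loop below processes 16 elements per iteration,
// whatever the element width: one vld1q for 8-bit sources, up to eight
// loads for the 64-bit paths. The scalar tails handle the leftovers.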
#ifdef __aarch64__
// U64 -> F32
convert64<uint64_t, float>(src, dst, win, window_start_x, window_end_x, window_step_x);
// ...
// S64 -> F32
convert64<int64_t, float>(src, dst, win, window_start_x, window_end_x, window_step_x);
// ...
#endif // __aarch64__
// S8 -> S16: sign-extend each half with vmovl_s8.
const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);

    const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};

    vst1q_s16(dst_ptr + x, texels.val[0]);
    vst1q_s16(dst_ptr + x + 8, texels.val[1]);
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
}
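// S8 -> S32 widens in two steps: vmovl_s8 to 16 bits, then vmovl_s16 on
// each half to reach 32 bits.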
const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);

    const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};

    vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
    vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
    vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
    vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
}
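// S8 -> F32 reuses the two-step widening and finishes with vcvtq_f32_s32;
// the conversion is exact because every int8 value is representable in float.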
const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);

    const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};

    vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
    vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
    vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
    vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
}
// Destinations with a dedicated micro-kernel (e.g. the F16 paths) dispatch
// through the selected entry from available_kernels.
uk->ukernel(_src, _dst, info, _policy, window);
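// U8 -> S16: zero-extend with vmovl_u8; reinterpreting the u16 result as
// s16 preserves values, since they are at most 255.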
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
                                 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};

    vst1q_s16(dst_ptr + x, texels.val[0]);
    vst1q_s16(dst_ptr + x + 8, texels.val[1]);
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
}
// U8 -> S32: widen twice, u8 -> s16 -> s32.
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
                                 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};

    vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
    vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
    vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
    vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
}
// U8 -> F32: widen to 32 bits as above, then convert with vcvtq_f32_s32.
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
                                 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};

    vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0])))); 
    vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
    vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
    vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
}
uk->ukernel(_src, _dst, info, _policy, window);
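// U8 -> U16: plain zero-extension of each half with vmovl_u8.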
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const uint16x8x2_t texels = {{vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}};

    vst1q_u16(dst_ptr + x, texels.val[0]);
    vst1q_u16(dst_ptr + x + 8, texels.val[1]);
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
}
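// S16 -> 8-bit narrowing comes in two flavours per destination type:
// ConvertPolicy::SATURATE uses vqmovn/vqmovun to clamp, while the wrapping
// policy uses vmovn, which simply keeps the low byte.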
// S16 -> S8, SATURATE.
const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};

    vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
}
// S16 -> S8, wrap.
const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};

    vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
}
// S16 -> U8, SATURATE: vqmovun_s16 clamps negatives to zero.
const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};

    vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
}
// S16 -> U8, wrap.
const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};

    vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
                                      vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
}
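// S16 -> S32: sign-extend each half with vmovl_s16.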
const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};

    const int32x4x4_t texels_s32 = {
        {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])),
         vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}};

    vst1q_s32(dst_ptr + x, texels_s32.val[0]);
    vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
    vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
    vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
}
// U16 -> U8, SATURATE.
const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};

    vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
}
// U16 -> U8, wrap.
const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};

    vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
}
// U16 -> U32: zero-extend each half with vmovl_u16.
const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};

    vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
    vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
    vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
    vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
}
uk->ukernel(_src, _dst, info, _policy, window);

// ...

uk->ukernel(_src, _dst, info, _policy, window);
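// F32 -> S32: vcvtq_s32_f32 rounds toward zero and saturates values outside
// the int32 range, matching the scalar static_cast in the tail loop for
// in-range inputs.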
const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float32x4x4_t texels = {{
        vld1q_f32(src_ptr + x),
        vld1q_f32(src_ptr + x + 4),
        vld1q_f32(src_ptr + x + 8),
        vld1q_f32(src_ptr + x + 12),
    }};

    vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
    vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
    vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
    vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
}
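// F32 -> U8, SATURATE: convert to s32, clamp to unsigned 16 bits with
// vqmovun_s32, then narrow to 8 bits with vqmovn_u16.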
const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float32x4x4_t texels = {{
        vld1q_f32(src_ptr + x),
        vld1q_f32(src_ptr + x + 4),
        vld1q_f32(src_ptr + x + 8),
        vld1q_f32(src_ptr + x + 12),
    }};

    vst1_u8(dst_ptr + x,
            vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])),
                                    vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
    vst1_u8(dst_ptr + x + 8,
            vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])),
                                    vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
}
// F32 -> S8, SATURATE: vqmovn_s32 then vqmovn_s16.
const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float32x4x4_t texels = {{
        vld1q_f32(src_ptr + x),
        vld1q_f32(src_ptr + x + 4),
        vld1q_f32(src_ptr + x + 8),
        vld1q_f32(src_ptr + x + 12),
    }};

    vst1_s8(dst_ptr + x,
            vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])),
                                    vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
    vst1_s8(dst_ptr + x + 8,
            vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])),
                                    vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
}
#ifdef __aarch64__
// S32 -> S64 reuses the convert64 driver and the int32_t -> int64_t
// specialisation above.
convert64<int32_t, int64_t>(src, dst, win, window_start_x, window_end_x, window_step_x);
#endif // __aarch64__
uk->ukernel(_src, _dst, info, _policy, window);
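// S32 -> F32: vcvtq_f32_s32 rounds to nearest; integers above 2^24 in
// magnitude may not be exactly representable.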
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int32x4x4_t texels = {{
        vld1q_s32(src_ptr + x),
        vld1q_s32(src_ptr + x + 4),
        vld1q_s32(src_ptr + x + 8),
        vld1q_s32(src_ptr + x + 12),
    }};

    vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
    vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
    vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
    vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
}
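// S32 -> 8-bit narrowing goes 32 -> 16 -> 8: vqmovn_s32/vqmovun_s32 then a
// second saturating narrow for SATURATE, or vmovn at both steps to wrap.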
// S32 -> S8, SATURATE.
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int32x4x4_t texels = {{
        vld1q_s32(src_ptr + x),
        vld1q_s32(src_ptr + x + 4),
        vld1q_s32(src_ptr + x + 8),
        vld1q_s32(src_ptr + x + 12),
    }};

    vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
    vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
}
// S32 -> S8, wrap.
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
                                 vld1q_s32(src_ptr + x + 8), vld1q_s32(src_ptr + x + 12)}};

    vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
    vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
}
// S32 -> U8, SATURATE: vqmovun_s32 clamps negatives to zero.
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
                                 vld1q_s32(src_ptr + x + 8), vld1q_s32(src_ptr + x + 12)}};

    vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
    vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
}
// S32 -> U8, wrap.
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for (; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
                                 vld1q_s32(src_ptr + x + 8), vld1q_s32(src_ptr + x + 12)}};

    vst1_u8(dst_ptr + x,
            vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),
                                   vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
    vst1_u8(dst_ptr + x + 8,
            vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),
                                   vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
}

for (; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
}
const char *CpuCastKernel::name() const
{
    return "CpuCastKernel.cpp";
}

const std::vector<CpuCastKernel::CastKernel> &CpuCastKernel::get_available_kernels()
{
    return available_kernels;
}