Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
64 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
68 "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
72 "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
75 "Only data_types supported [in] U16 -> [out] U8, U32");
78 "Only data_types supported [in] S16 -> [out] U8, S32");
81 "Only data_types supported [in] BFLOAT16 -> [out] F32");
86 "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
91 "Only data_types supported [in] F32 -> [out] QASYMM8, BFLOAT16, F16, S32, U8");
96 "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8");
if(dst->total_size() > 0)
ICPPKernel::configure(win);
const auto window_start_x = static_cast<int>(window.x().start());
const auto window_end_x   = static_cast<int>(window.x().end());
const int  window_step_x  = 16;
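
// Every conversion routine below follows the same shape: a vectorized main loop
// that handles 16 elements per iteration, followed by a scalar loop for the
// left-over elements at the end of the row.

/* Up-conversion S8 -> S16 */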
const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vmovl_s8(vget_low_s8(texels_s8)),
            vmovl_s8(vget_high_s8(texels_s8))
        }
    };

    vst1q_s16(dst_ptr + x, texels.val[0]);
    vst1q_s16(dst_ptr + x + 8, texels.val[1]);
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
}
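
/* Up-conversion S8 -> S32: widen 8-bit -> 16-bit, then 16-bit -> 32-bit */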
const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vmovl_s8(vget_low_s8(texels_s8)),
            vmovl_s8(vget_high_s8(texels_s8))
        }
    };

    vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
    vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
    vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
    vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
}
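
/* Up-conversion S8 -> F32 */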
const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    // Load through src_ptr + x so the x offset is applied, as in the other cases
    const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vmovl_s8(vget_low_s8(texels_s8)),
            vmovl_s8(vget_high_s8(texels_s8))
        }
    };

    vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
    vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
    vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
    vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
}
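
/* Up-conversion S8 -> F16 (compiled only when FP16 vector arithmetic is available) */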
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vmovl_s8(vget_low_s8(texels_s8)),
            vmovl_s8(vget_high_s8(texels_s8))
        }
    };

    // Stores reconstructed from the S16 -> F16 pattern (vcvtq_f16_s16)
    vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
    vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

/* Up-conversion U8 -> S16 */
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
        }
    };

    vst1q_s16(dst_ptr + x, texels.val[0]);
    vst1q_s16(dst_ptr + x + 8, texels.val[1]);
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
}
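
/* Up-conversion U8 -> S32 */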
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
        }
    };

    vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
    vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
    vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
    vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
}
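
/* Up-conversion U8 -> F32 */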
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
        }
    };

    vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
    vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
    vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
    vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
}
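
/* Up-conversion U8 -> F16 (compiled only when FP16 vector arithmetic is available) */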
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const int16x8x2_t texels =
    {
        {
            vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
        }
    };

    // Stores reconstructed from the S16 -> F16 pattern (vcvtq_f16_s16)
    vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
    vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

/* Up-conversion U8 -> U16 */
const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);

    const uint16x8x2_t texels =
    {
        {
            vmovl_u8(vget_low_u8(texels_u8)),
            vmovl_u8(vget_high_u8(texels_u8))
        }
    };

    vst1q_u16(dst_ptr + x, texels.val[0]);
    vst1q_u16(dst_ptr + x + 8, texels.val[1]);
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
}
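
/* Down-conversion S16 -> S8 with ConvertPolicy::SATURATE */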
const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int16x8x2_t texels =
    {
        {
            vld1q_s16(src_ptr + x),
            vld1q_s16(src_ptr + x + 8)
        }
    };

    vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
}
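
/* Down-conversion S16 -> S8, wrapping (high bits truncated) */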
const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int16x8x2_t texels =
    {
        {
            vld1q_s16(src_ptr + x),
            vld1q_s16(src_ptr + x + 8)
        }
    };

    vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
}
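
/* Down-conversion S16 -> U8 with ConvertPolicy::SATURATE (negative values clamp to 0) */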
const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int16x8x2_t texels =
    {
        {
            vld1q_s16(src_ptr + x),
            vld1q_s16(src_ptr + x + 8)
        }
    };

    vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
}
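
/* Down-conversion S16 -> U8, wrapping */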
const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int16x8x2_t texels =
    {
        {
            vld1q_s16(src_ptr + x),
            vld1q_s16(src_ptr + x + 8)
        }
    };

    vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
                                      vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
}
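
/* Up-conversion S16 -> S32 */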
const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int16x8x2_t texels =
    {
        {
            vld1q_s16(src_ptr + x),
            vld1q_s16(src_ptr + x + 8)
        }
    };

    const int32x4x4_t texels_s32 =
    {
        {
            vmovl_s16(vget_low_s16(texels.val[0])),
            vmovl_s16(vget_high_s16(texels.val[0])),
            vmovl_s16(vget_low_s16(texels.val[1])),
            vmovl_s16(vget_high_s16(texels.val[1]))
        }
    };

    vst1q_s32(dst_ptr + x, texels_s32.val[0]);
    vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
    vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
    vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
}
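
/* Down-conversion U16 -> U8 with ConvertPolicy::SATURATE */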
const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint16x8x2_t texels =
    {
        {
            vld1q_u16(src_ptr + x),
            vld1q_u16(src_ptr + x + 8)
        }
    };

    vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
}
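
/* Down-conversion U16 -> U8, wrapping */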
const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint16x8x2_t texels =
    {
        {
            vld1q_u16(src_ptr + x),
            vld1q_u16(src_ptr + x + 8)
        }
    };

    vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
}
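
/* Up-conversion U16 -> U32 */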
const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const uint16x8x2_t texels =
    {
        {
            vld1q_u16(src_ptr + x),
            vld1q_u16(src_ptr + x + 8)
        }
    };

    vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
    vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
    vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
    vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
}
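
/* Up-conversion BFLOAT16 -> F32 (compiled only with BF16 support).
 * A bfloat16 is the top half of an IEEE-754 binary32, so widening is a
 * 16-bit left shift into the high half of a float. */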
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
const auto src_ptr = reinterpret_cast<const bfloat16 *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    // Index the loads and stores by x so each iteration advances through the row
    const uint16x8x2_t texels =
    {
        {
            vld1q_u16(reinterpret_cast<const uint16_t *>(src_ptr) + x),
            vld1q_u16(reinterpret_cast<const uint16_t *>(src_ptr) + x + 8)
        }
    };

    vst1q_f32(dst_ptr + x,
              vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
    vst1q_f32(dst_ptr + x + 4,
              vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
    vst1q_f32(dst_ptr + x + 8,
              vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
    vst1q_f32(dst_ptr + x + 12,
              vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = float(*(src_ptr + x));
}
#endif // defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
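
/* Down-conversion F16 -> S8 with ConvertPolicy::SATURATE (compiled only with FP16 support) */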
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float16x8x2_t texels =
    {
        {
            vld1q_f16(src_ptr + x),
            vld1q_f16(src_ptr + x + 8)
        }
    };

    // Store reconstructed from the pattern: convert to S16, then narrow with saturation
    vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])),
                                      vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
}
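
/* Down-conversion F16 -> U8 with ConvertPolicy::SATURATE */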
const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float16x8x2_t texels =
    {
        {
            vld1q_f16(src_ptr + x),
            vld1q_f16(src_ptr + x + 8)
        }
    };

    // Store reconstructed from the pattern: convert to S16, then unsigned-saturating narrow
    vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])),
                                      vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
}
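
/* Up-conversion F16 -> F32 */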
const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float16x8x2_t texels =
    {
        {
            vld1q_f16(src_ptr + x),
            vld1q_f16(src_ptr + x + 8)
        }
    };

    vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
    vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
    vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
    vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
}
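
/* Down-conversion F16 -> S32: convert to F32, then truncate to S32 */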
const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float16x8x2_t texels =
    {
        {
            vld1q_f16(src_ptr + x),
            vld1q_f16(src_ptr + x + 8)
        }
    };

    vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
    vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
    vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
    vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
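
/* Down-conversion F32 -> F16 (compiled only when FP16 vector arithmetic is available) */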
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float32x4x4_t texels =
    {
        {
            vld1q_f32(src_ptr + x),
            vld1q_f32(src_ptr + x + 4),
            vld1q_f32(src_ptr + x + 8),
            vld1q_f32(src_ptr + x + 12)
        }
    };

    vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
    vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
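
/* Down-conversion F32 -> BFLOAT16 (compiled only with BF16 support) */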
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
const auto dst_ptr = reinterpret_cast<bfloat16 *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    // Index by x so each iteration converts the next 16 elements
    wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(src.ptr()) + x,
                           reinterpret_cast<uint16_t *>(dst.ptr()) + x);
    wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(src.ptr()) + x + 8,
                           reinterpret_cast<uint16_t *>(dst.ptr()) + x + 8);
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = *(src_ptr + x);
}
#endif // defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
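
/* Down-conversion F32 -> S32 (vcvtq_s32_f32 truncates toward zero) */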
const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float32x4x4_t texels =
    {
        {
            vld1q_f32(src_ptr + x),
            vld1q_f32(src_ptr + x + 4),
            vld1q_f32(src_ptr + x + 8),
            vld1q_f32(src_ptr + x + 12)
        }
    };

    vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
    vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
    vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
    vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
}
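
/* Down-conversion F32 -> U8 with ConvertPolicy::SATURATE:
 * F32 -> S32 -> U16 -> U8, saturating at each narrowing step */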
const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float32x4x4_t texels =
    {
        {
            vld1q_f32(src_ptr + x),
            vld1q_f32(src_ptr + x + 4),
            vld1q_f32(src_ptr + x + 8),
            vld1q_f32(src_ptr + x + 12)
        }
    };

    vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
    vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
}
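
/* Down-conversion F32 -> S8 with ConvertPolicy::SATURATE */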
const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float32x4x4_t texels =
    {
        {
            vld1q_f32(src_ptr + x),
            vld1q_f32(src_ptr + x + 4),
            vld1q_f32(src_ptr + x + 8),
            vld1q_f32(src_ptr + x + 12)
        }
    };

    vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
    vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
}
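
/* Down-conversion S32 -> F16 (compiled only when FP16 vector arithmetic is available) */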
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const float32x4x4_t texels =
    {
        {
            vcvtq_f32_s32(vld1q_s32(src_ptr + x)),
            vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
            vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)),
            vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))
        }
    };

    vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
    vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
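
/* Conversion S32 -> F32 */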
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int32x4x4_t texels =
    {
        {
            vld1q_s32(src_ptr + x),
            vld1q_s32(src_ptr + x + 4),
            vld1q_s32(src_ptr + x + 8),
            vld1q_s32(src_ptr + x + 12)
        }
    };

    vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
    vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
    vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
    vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
}
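
/* Down-conversion S32 -> S8 with ConvertPolicy::SATURATE:
 * narrow 32 -> 16 -> 8 bits, saturating at each step */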
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int32x4x4_t texels =
    {
        {
            vld1q_s32(src_ptr + x),
            vld1q_s32(src_ptr + x + 4),
            vld1q_s32(src_ptr + x + 8),
            vld1q_s32(src_ptr + x + 12)
        }
    };

    vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
    vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
}
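
/* Down-conversion S32 -> S8, wrapping */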
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int32x4x4_t texels =
    {
        {
            vld1q_s32(src_ptr + x),
            vld1q_s32(src_ptr + x + 4),
            vld1q_s32(src_ptr + x + 8),
            vld1q_s32(src_ptr + x + 12)
        }
    };

    vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
    vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
}
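
/* Down-conversion S32 -> U8 with ConvertPolicy::SATURATE */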
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int32x4x4_t texels =
    {
        {
            vld1q_s32(src_ptr + x),
            vld1q_s32(src_ptr + x + 4),
            vld1q_s32(src_ptr + x + 8),
            vld1q_s32(src_ptr + x + 12)
        }
    };

    vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
    vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
}
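
/* Down-conversion S32 -> U8, wrapping */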
const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

int x = window_start_x;
for(; x <= (window_end_x - window_step_x); x += window_step_x)
{
    const int32x4x4_t texels =
    {
        {
            vld1q_s32(src_ptr + x),
            vld1q_s32(src_ptr + x + 4),
            vld1q_s32(src_ptr + x + 8),
            vld1q_s32(src_ptr + x + 12)
        }
    };

    vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
    vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
}

for(; x < window_end_x; ++x)
{
    *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
}
const char *CpuCastKernel::name() const
{
    return "CpuCastKernel";
}