Execute the kernel on the passed window.
144 const auto window_start_x = static_cast<int>(
window.
x().
start());
145 const auto window_end_x = static_cast<int>(
window.
x().
end());
146 const int window_step_x = 16;
158 const int16x8_t
b = vdupq_n_s16(_shift);
167 const auto input_ptr = reinterpret_cast<const int8_t *>(
input.ptr());
168 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
169 int x = window_start_x;
171 for(; x <= (window_end_x - window_step_x); x += window_step_x)
173 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
175 const int16x8x2_t texels =
178 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)),
b),
179 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)),
b)
183 vst1q_s16(output_ptr + x, texels.val[0]);
184 vst1q_s16(output_ptr + x + 8, texels.val[1]);
188 for(; x < window_end_x; ++x)
190 *(output_ptr + x) = static_cast<int16_t>(*(input_ptr + x) << _shift);
201 const auto input_ptr = reinterpret_cast<const int8_t *>(
input.ptr());
202 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
203 int x = window_start_x;
205 for(; x <= (window_end_x - window_step_x); x += window_step_x)
207 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
209 const int16x8x2_t texels =
212 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)),
b),
213 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)),
b)
217 vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
218 vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
219 vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
220 vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
224 for(; x < window_end_x; ++x)
226 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
237 const auto input_ptr = reinterpret_cast<const int8_t *>(
input.ptr());
238 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
240 int x = window_start_x;
241 for(; x <= (window_end_x - window_step_x); x += window_step_x)
243 const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<int8_t *>(
input.ptr()));
245 const int16x8x2_t texels =
248 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)),
b),
249 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)),
b)
252 vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
253 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
254 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
255 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
259 for(; x < window_end_x; ++x)
261 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) << _shift);
267 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 273 const auto input_ptr = reinterpret_cast<const int8_t *>(
input.ptr());
274 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
275 int x = window_start_x;
277 for(; x <= (window_end_x - window_step_x); x += window_step_x)
279 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
281 const int16x8x2_t texels =
284 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)),
b),
285 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)),
b)
293 for(; x < window_end_x; ++x)
295 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
301 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 312 const int16x8_t
b = vdupq_n_s16(_shift);
321 const auto input_ptr = reinterpret_cast<const uint8_t *>(
input.ptr());
322 const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
324 int x = window_start_x;
325 for(; x <= (window_end_x - window_step_x); x += window_step_x)
327 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
329 const int16x8x2_t texels =
332 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
b),
333 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))),
b)
337 vst1q_s16(output_ptr + x, texels.val[0]);
338 vst1q_s16(output_ptr + x + 8, texels.val[1]);
342 for(; x < window_end_x; ++x)
344 auto in = static_cast<int32_t>(*(input_ptr + x));
345 *(output_ptr + x) = in << _shift;
356 const auto input_ptr = reinterpret_cast<const uint8_t *>(
input.ptr());
357 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
359 int x = window_start_x;
360 for(; x <= (window_end_x - window_step_x); x += window_step_x)
362 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
364 const int16x8x2_t texels =
367 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
b),
368 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))),
b)
372 vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
373 vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
374 vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
375 vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
379 for(; x < window_end_x; ++x)
381 auto in = static_cast<uint32_t>(*(input_ptr + x));
382 *(output_ptr + x) = in << _shift;
393 const auto input_ptr = reinterpret_cast<const uint8_t *>(
input.ptr());
394 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
396 int x = window_start_x;
397 for(; x <= (window_end_x - window_step_x); x += window_step_x)
399 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
401 const int16x8x2_t texels =
404 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
b),
405 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))),
b)
408 vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
409 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
410 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
411 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
415 for(; x < window_end_x; ++x)
417 auto in = static_cast<uint32_t>(*(input_ptr + x));
418 *(output_ptr + x) = static_cast<float>(in << _shift);
424 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 430 const auto input_ptr = reinterpret_cast<const uint8_t *>(
input.ptr());
431 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
433 int x = window_start_x;
434 for(; x <= (window_end_x - window_step_x); x += window_step_x)
436 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
438 const int16x8x2_t texels =
441 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
b),
442 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))),
b)
450 for(; x < window_end_x; ++x)
452 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
458 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 464 const auto input_ptr = reinterpret_cast<const uint8_t *>(
input.ptr());
465 const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
467 int x = window_start_x;
468 for(; x <= (window_end_x - window_step_x); x += window_step_x)
470 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
472 const uint16x8x2_t texels =
475 vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)),
b),
476 vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)),
b)
480 vst1q_u16(output_ptr + x, texels.val[0]);
481 vst1q_u16(output_ptr + x + 8, texels.val[1]);
485 for(; x < window_end_x; ++x)
487 *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x)) << _shift;
504 const int16x8_t
b = vdupq_n_s16(-static_cast<int16_t>(_shift));
511 const auto input_ptr = reinterpret_cast<const int16_t *>(
input.ptr());
512 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
514 int x = window_start_x;
515 for(; x <= (window_end_x - window_step_x); x += window_step_x)
517 const int16x8x2_t texels =
520 vqshlq_s16(vld1q_s16(input_ptr + x),
b),
521 vqshlq_s16(vld1q_s16(input_ptr + x + 8),
b)
525 vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
529 for(; x < window_end_x; ++x)
531 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
540 const auto input_ptr = reinterpret_cast<const int16_t *>(
input.ptr());
541 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
543 int x = window_start_x;
544 for(; x <= (window_end_x - window_step_x); x += window_step_x)
546 const int16x8x2_t texels =
549 vshlq_s16(vld1q_s16(input_ptr + x),
b),
550 vshlq_s16(vld1q_s16(input_ptr + x + 8),
b)
554 vst1q_s8(output_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
558 for(; x < window_end_x; ++x)
560 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
569 const int16x8_t
b = vdupq_n_s16(-static_cast<int16_t>(_shift));
576 const auto input_ptr = reinterpret_cast<const int16_t *>(
input.ptr());
577 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
579 int x = window_start_x;
580 for(; x <= (window_end_x - window_step_x); x += window_step_x)
582 const int16x8x2_t texels =
585 vqshlq_s16(vld1q_s16(input_ptr + x),
b),
586 vqshlq_s16(vld1q_s16(input_ptr + x + 8),
b)
590 vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
594 for(; x < window_end_x; ++x)
596 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
605 const auto input_ptr = reinterpret_cast<const int16_t *>(
input.ptr());
606 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
608 int x = window_start_x;
609 for(; x <= (window_end_x - window_step_x); x += window_step_x)
611 const int16x8x2_t texels =
614 vshlq_s16(vld1q_s16(input_ptr + x),
b),
615 vshlq_s16(vld1q_s16(input_ptr + x + 8),
b)
619 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
620 vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
624 for(; x < window_end_x; ++x)
626 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
635 const int32x4_t
b = vdupq_n_s32(_shift);
640 const auto input_ptr = reinterpret_cast<const int16_t *>(
input.ptr());
641 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
643 int x = window_start_x;
644 for(; x <= (window_end_x - window_step_x); x += window_step_x)
646 const int16x8x2_t texels =
649 vld1q_s16(input_ptr + x),
650 vld1q_s16(input_ptr + x + 8)
654 const int32x4x4_t texels_s32 =
657 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])),
b),
658 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])),
b),
659 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])),
b),
660 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])),
b)
664 vst1q_s32(output_ptr + x, texels_s32.val[0]);
665 vst1q_s32(output_ptr + x + 4, texels_s32.val[1]);
666 vst1q_s32(output_ptr + x + 8, texels_s32.val[2]);
667 vst1q_s32(output_ptr + x + 12, texels_s32.val[3]);
671 for(; x < window_end_x; ++x)
673 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
690 const int16x8_t
b = vdupq_n_s16(-static_cast<int16_t>(_shift));
697 const auto input_ptr = reinterpret_cast<const uint16_t *>(
input.ptr());
698 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
700 int x = window_start_x;
701 for(; x <= (window_end_x - window_step_x); x += window_step_x)
703 const uint16x8x2_t texels =
706 vqshlq_u16(vld1q_u16(input_ptr + x),
b),
707 vqshlq_u16(vld1q_u16(input_ptr + x + 8),
b)
711 vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
715 for(; x < window_end_x; ++x)
717 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
726 const auto input_ptr = reinterpret_cast<const uint16_t *>(
input.ptr());
727 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
729 int x = window_start_x;
730 for(; x <= (window_end_x - window_step_x); x += window_step_x)
732 const uint16x8x2_t texels =
735 vshlq_u16(vld1q_u16(input_ptr + x),
b),
736 vshlq_u16(vld1q_u16(input_ptr + x + 8),
b)
740 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
744 for(; x < window_end_x; ++x)
746 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
756 const int32x4_t
b = vdupq_n_s32(_shift);
761 const auto input_ptr = reinterpret_cast<const uint16_t *>(
input.ptr());
762 const auto output_ptr = reinterpret_cast<uint32_t *>(output.ptr());
764 int x = window_start_x;
765 for(; x <= (window_end_x - window_step_x); x += window_step_x)
767 const uint16x8x2_t texels =
770 vld1q_u16(input_ptr + x),
771 vld1q_u16(input_ptr + x + 8)
775 vst1q_u32(output_ptr + x, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])),
b));
776 vst1q_u32(output_ptr + x + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])),
b));
777 vst1q_u32(output_ptr + x + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])),
b));
778 vst1q_u32(output_ptr + x + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])),
b));
781 for(; x < window_end_x; ++x)
783 *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) << _shift);
795 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) 804 const auto input_ptr = reinterpret_cast<const bfloat16 *>(
input.ptr());
805 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
807 int x = window_start_x;
808 for(; x <= (window_end_x - window_step_x); x += window_step_x)
810 const uint16x8x2_t texels =
813 vld1q_u16(reinterpret_cast<uint16_t *>(
input.ptr())),
814 vld1q_u16(reinterpret_cast<uint16_t *>(
input.ptr()) + 8)
818 vst1q_f32(reinterpret_cast<float *>(output.ptr()),
819 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
820 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4,
821 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
822 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8,
823 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
824 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12,
825 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
828 for(; x < window_end_x; ++x)
830 *(output_ptr + x) =
float(*(input_ptr + x));
841 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 847 const float16_t scale_s = 1 << _shift;
848 const float16x8_t
scale = vdupq_n_f16(scale_s);
853 const auto input_ptr = reinterpret_cast<const float16_t *>(
input.ptr());
854 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
856 int x = window_start_x;
857 for(; x <= (window_end_x - window_step_x); x += window_step_x)
859 const float16x8x2_t texels =
871 for(; x < window_end_x; ++x)
873 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
882 const float16_t scale_s = 1 << _shift;
883 const float16x8_t
scale = vdupq_n_f16(scale_s);
888 const auto input_ptr = reinterpret_cast<const float16_t *>(
input.ptr());
889 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
891 int x = window_start_x;
892 for(; x <= (window_end_x - window_step_x); x += window_step_x)
894 const float16x8x2_t texels =
906 for(; x < window_end_x; ++x)
908 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
917 const float scale_s = 1 << _shift;
918 const float32x4_t
scale = vdupq_n_f32(scale_s);
923 const auto input_ptr = reinterpret_cast<const float16_t *>(
input.ptr());
924 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
926 int x = window_start_x;
927 for(; x <= (window_end_x - window_step_x); x += window_step_x)
929 const float16x8x2_t texels =
932 vld1q_f16(input_ptr + x),
933 vld1q_f16(input_ptr + x + 8)
936 vst1q_f32(output_ptr + x, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])),
scale));
937 vst1q_f32(output_ptr + x + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])),
scale));
938 vst1q_f32(output_ptr + x + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])),
scale));
939 vst1q_f32(output_ptr + x + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])),
scale));
943 for(; x < window_end_x; ++x)
945 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
953 const float scale_s = 1 << _shift;
954 const float32x4_t
scale = vdupq_n_f32(scale_s);
959 const auto input_ptr = reinterpret_cast<const float16_t *>(
input.ptr());
960 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
962 int x = window_start_x;
963 for(; x <= (window_end_x - window_step_x); x += window_step_x)
965 const float16x8x2_t texels =
968 vld1q_f16(input_ptr + x),
969 vld1q_f16(input_ptr + x + 8)
973 vst1q_s32(output_ptr + x, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])),
scale)));
974 vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])),
scale)));
975 vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])),
scale)));
976 vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])),
scale)));
980 for(; x < window_end_x; ++x)
982 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
996 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 999 const float scale_s = 1.f / (1 << _shift);
1000 const float32x4_t
scale = vdupq_n_f32(scale_s);
1005 const auto input_ptr = reinterpret_cast<const float *>(
input.ptr());
1006 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
1008 int x = window_start_x;
1009 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1011 const float32x4x4_t texels =
1014 vmulq_f32(vld1q_f32(input_ptr + x),
scale),
1015 vmulq_f32(vld1q_f32(input_ptr + x + 4),
scale),
1016 vmulq_f32(vld1q_f32(input_ptr + x + 8),
scale),
1017 vmulq_f32(vld1q_f32(input_ptr + x + 12),
scale)
1021 vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
1022 vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
1026 for(; x < window_end_x; ++x)
1028 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
1035 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) 1041 const auto input_ptr = reinterpret_cast<const float *>(
input.ptr());
1042 const auto output_ptr = reinterpret_cast<bfloat16 *>(output.ptr());
1044 int x = window_start_x;
1045 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1047 wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(
input.ptr()),
1048 reinterpret_cast<uint16_t *>(output.ptr()));
1049 wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(
input.ptr()) + 8,
1050 reinterpret_cast<uint16_t *>(output.ptr()) + 8);
1053 for(; x < window_end_x; ++x)
1055 *(output_ptr + x) = *(input_ptr + x);
1064 const float scale_s = 1.f / (1 << _shift);
1065 const float32x4_t
scale = vdupq_n_f32(scale_s);
1070 const auto input_ptr = reinterpret_cast<const float *>(
input.ptr());
1071 const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
1073 int x = window_start_x;
1074 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1076 const float32x4x4_t texels =
1079 vmulq_f32(vld1q_f32(input_ptr + x),
scale),
1080 vmulq_f32(vld1q_f32(input_ptr + x + 4),
scale),
1081 vmulq_f32(vld1q_f32(input_ptr + x + 8),
scale),
1082 vmulq_f32(vld1q_f32(input_ptr + x + 12),
scale),
1086 vst1q_s32(output_ptr + x, vcvtq_s32_f32(texels.val[0]));
1087 vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
1088 vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
1089 vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
1093 for(; x < window_end_x; ++x)
1095 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
1104 const float scale_s = 1.f / (1 << _shift);
1105 const float32x4_t
scale = vdupq_n_f32(scale_s);
1110 const auto input_ptr = reinterpret_cast<const float *>(
input.ptr());
1111 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
1113 int x = window_start_x;
1114 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1116 const float32x4x4_t texels =
1119 vmulq_f32(vld1q_f32(input_ptr + x),
scale),
1120 vmulq_f32(vld1q_f32(input_ptr + x + 4),
scale),
1121 vmulq_f32(vld1q_f32(input_ptr + x + 8),
scale),
1122 vmulq_f32(vld1q_f32(input_ptr + x + 12),
scale),
1126 vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
1127 vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
1131 for(; x < window_end_x; ++x)
1133 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
1141 const float scale_s = 1.f / (1 << _shift);
1142 const float32x4_t
scale = vdupq_n_f32(scale_s);
1147 const auto input_ptr = reinterpret_cast<const float *>(
input.ptr());
1148 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
1150 int x = window_start_x;
1151 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1153 const float32x4x4_t texels =
1156 vmulq_f32(vld1q_f32(input_ptr + x),
scale),
1157 vmulq_f32(vld1q_f32(input_ptr + x + 4),
scale),
1158 vmulq_f32(vld1q_f32(input_ptr + x + 8),
scale),
1159 vmulq_f32(vld1q_f32(input_ptr + x + 12),
scale),
1163 vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
1164 vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
1167 for(; x < window_end_x; ++x)
1169 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
1184 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1187 const float scale_s = 1.f / (1 << _shift);
1188 const float32x4_t
scale = vdupq_n_f32(scale_s);
1193 const auto input_ptr = reinterpret_cast<const int32_t *>(
input.ptr());
1194 const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
1196 int x = window_start_x;
1197 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1199 const float32x4x4_t texels =
1202 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x)),
scale),
1203 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 4)),
scale),
1204 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 8)),
scale),
1205 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 12)),
scale)
1209 vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
1210 vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
1214 for(; x < window_end_x; ++x)
1216 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
1225 const int scale_s = 1.f / (1 << _shift);
1226 const int32x4_t
scale = vdupq_n_s32(scale_s);
1231 const auto input_ptr = reinterpret_cast<const int32_t *>(
input.ptr());
1232 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
1234 int x = window_start_x;
1235 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1237 const int32x4x4_t texels =
1240 vmulq_s32(vld1q_s32(input_ptr + x),
scale),
1241 vmulq_s32(vld1q_s32(input_ptr + x + 4),
scale),
1242 vmulq_s32(vld1q_s32(input_ptr + x + 8),
scale),
1243 vmulq_s32(vld1q_s32(input_ptr + x + 12),
scale),
1247 vst1q_f32(output_ptr + x, vcvtq_f32_s32(texels.val[0]));
1248 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
1249 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
1250 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
1254 for(; x < window_end_x; ++x)
1256 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
1264 const int32x4_t
b = vdupq_n_s32(-static_cast<int32_t>(_shift));
1271 const auto input_ptr = reinterpret_cast<const int32_t *>(
input.ptr());
1272 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
1274 int x = window_start_x;
1275 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1277 const int32x4x4_t texels =
1280 vqshlq_s32(vld1q_s32(input_ptr + x),
b),
1281 vqshlq_s32(vld1q_s32(input_ptr + x + 4),
b),
1282 vqshlq_s32(vld1q_s32(input_ptr + x + 8),
b),
1283 vqshlq_s32(vld1q_s32(input_ptr + x + 12),
b)
1286 vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
1287 vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
1291 for(; x < window_end_x; ++x)
1293 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
1302 const auto input_ptr = reinterpret_cast<const int32_t *>(
input.ptr());
1303 const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
1305 int x = window_start_x;
1306 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1308 const int32x4x4_t texels =
1311 vshlq_s32(vld1q_s32(input_ptr + x),
b),
1312 vshlq_s32(vld1q_s32(input_ptr + x + 4),
b),
1313 vshlq_s32(vld1q_s32(input_ptr + x + 8),
b),
1314 vshlq_s32(vld1q_s32(input_ptr + x + 12),
b)
1318 vst1_s8(output_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
1319 vst1_s8(output_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
1323 for(; x < window_end_x; ++x)
1325 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
1335 const int32x4_t
b = vdupq_n_s32(-static_cast<int32_t>(_shift));
1342 const auto input_ptr = reinterpret_cast<const int32_t *>(
input.ptr());
1343 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
1345 int x = window_start_x;
1346 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1348 const int32x4x4_t texels =
1351 vqshlq_s32(vld1q_s32(input_ptr + x),
b),
1352 vqshlq_s32(vld1q_s32(input_ptr + x + 4),
b),
1353 vqshlq_s32(vld1q_s32(input_ptr + x + 8),
b),
1354 vqshlq_s32(vld1q_s32(input_ptr + x + 12),
b)
1357 vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
1358 vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
1362 for(; x < window_end_x; ++x)
1364 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
1373 const auto input_ptr = reinterpret_cast<const int32_t *>(
input.ptr());
1374 const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
1376 int x = window_start_x;
1377 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1379 const int32x4x4_t texels =
1382 vshlq_s32(vld1q_s32(input_ptr + x),
b),
1383 vshlq_s32(vld1q_s32(input_ptr + x + 4),
b),
1384 vshlq_s32(vld1q_s32(input_ptr + x + 8),
b),
1385 vshlq_s32(vld1q_s32(input_ptr + x + 12),
b)
1389 vst1_u8(output_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
1390 vst1_u8(output_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
1394 for(; x < window_end_x; ++x)
1396 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
float16x8_t vcvtq_f16_s16(int16x8_t)
const Window & window() const
The maximum window the kernel can be executed on.
float16x8_t vmulq_f16(float16x8_t, float16x8_t)
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
1 channel, 1 U8 per channel
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Describe one of the image's dimensions with a start, end and step.
1 channel, 1 U16 per channel
1 channel, 1 F16 per channel
1 channel, 1 S32 per channel
16-bit brain floating-point number
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
1 channel, 1 U32 per channel
quantized, asymmetric fixed-point 8-bit number unsigned
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
1 channel, 1 S16 per channel
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
int16x8_t vcvtq_s16_f16(float16x8_t)
quantized, asymmetric fixed-point 8-bit number signed
constexpr int end() const
Return the end of the dimension.
Iterator updated by execute_window_loop for each window element.
constexpr int start() const
Return the start of the dimension.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
constexpr const Dimension & x() const
Alias to access the first dimension of the window.