Execute the kernel on the passed window.
147 const auto window_start_x =
static_cast<int>(window.
x().
start());
148 const auto window_end_x =
static_cast<int>(window.
x().
end());
149 const int window_step_x = 16;
161 const int16x8_t
b = vdupq_n_s16(_shift);
170 const auto input_ptr =
reinterpret_cast<const int8_t *
>(
input.ptr());
171 const auto output_ptr =
reinterpret_cast<int16_t *
>(output.ptr());
172 int x = window_start_x;
174 for(; x <= (window_end_x - window_step_x); x += window_step_x)
176 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
178 const int16x8x2_t texels =
181 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
182 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
186 vst1q_s16(output_ptr + x, texels.val[0]);
187 vst1q_s16(output_ptr + x + 8, texels.val[1]);
191 for(; x < window_end_x; ++x)
193 *(output_ptr + x) = static_cast<int16_t>(*(input_ptr + x) << _shift);
204 const auto input_ptr =
reinterpret_cast<const int8_t *
>(
input.ptr());
205 const auto output_ptr =
reinterpret_cast<int32_t *
>(output.ptr());
206 int x = window_start_x;
208 for(; x <= (window_end_x - window_step_x); x += window_step_x)
210 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
212 const int16x8x2_t texels =
215 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
216 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
220 vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
221 vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
222 vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
223 vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
227 for(; x < window_end_x; ++x)
229 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
240 const auto input_ptr =
reinterpret_cast<const int8_t *
>(
input.ptr());
241 const auto output_ptr =
reinterpret_cast<float *
>(output.ptr());
243 int x = window_start_x;
244 for(; x <= (window_end_x - window_step_x); x += window_step_x)
246 const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<int8_t *>(
input.ptr()));
248 const int16x8x2_t texels =
251 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
252 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
255 vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
256 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
257 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
258 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
262 for(; x < window_end_x; ++x)
264 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) << _shift);
270 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 276 const auto input_ptr =
reinterpret_cast<const int8_t *
>(
input.ptr());
277 const auto output_ptr =
reinterpret_cast<float16_t *
>(output.ptr());
278 int x = window_start_x;
280 for(; x <= (window_end_x - window_step_x); x += window_step_x)
282 const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);
284 const int16x8x2_t texels =
287 vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
288 vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
296 for(; x < window_end_x; ++x)
298 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
304 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 315 const int16x8_t b = vdupq_n_s16(_shift);
324 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
325 const auto output_ptr =
reinterpret_cast<int16_t *
>(output.ptr());
327 int x = window_start_x;
328 for(; x <= (window_end_x - window_step_x); x += window_step_x)
330 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
332 const int16x8x2_t texels =
335 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
336 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
340 vst1q_s16(output_ptr + x, texels.val[0]);
341 vst1q_s16(output_ptr + x + 8, texels.val[1]);
345 for(; x < window_end_x; ++x)
347 auto in =
static_cast<int32_t
>(*(input_ptr + x));
348 *(output_ptr + x) = in << _shift;
359 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
360 const auto output_ptr =
reinterpret_cast<int32_t *
>(output.ptr());
362 int x = window_start_x;
363 for(; x <= (window_end_x - window_step_x); x += window_step_x)
365 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
367 const int16x8x2_t texels =
370 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
371 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
375 vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
376 vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
377 vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
378 vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
382 for(; x < window_end_x; ++x)
384 auto in =
static_cast<uint32_t
>(*(input_ptr + x));
385 *(output_ptr + x) = in << _shift;
396 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
397 const auto output_ptr =
reinterpret_cast<float *
>(output.ptr());
399 int x = window_start_x;
400 for(; x <= (window_end_x - window_step_x); x += window_step_x)
402 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
404 const int16x8x2_t texels =
407 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
408 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
411 vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
412 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
413 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
414 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
418 for(; x < window_end_x; ++x)
420 auto in =
static_cast<uint32_t
>(*(input_ptr + x));
421 *(output_ptr + x) = static_cast<float>(in << _shift);
427 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 433 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
434 const auto output_ptr =
reinterpret_cast<float16_t *
>(output.ptr());
436 int x = window_start_x;
437 for(; x <= (window_end_x - window_step_x); x += window_step_x)
439 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
441 const int16x8x2_t texels =
444 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
445 vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
453 for(; x < window_end_x; ++x)
455 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
461 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 467 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
468 const auto output_ptr =
reinterpret_cast<uint16_t *
>(output.ptr());
470 int x = window_start_x;
471 for(; x <= (window_end_x - window_step_x); x += window_step_x)
473 const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);
475 const uint16x8x2_t texels =
478 vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
479 vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
483 vst1q_u16(output_ptr + x, texels.val[0]);
484 vst1q_u16(output_ptr + x + 8, texels.val[1]);
488 for(; x < window_end_x; ++x)
490 *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x)) << _shift;
507 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
514 const auto input_ptr =
reinterpret_cast<const int16_t *
>(
input.ptr());
515 const auto output_ptr =
reinterpret_cast<int8_t *
>(output.ptr());
517 int x = window_start_x;
518 for(; x <= (window_end_x - window_step_x); x += window_step_x)
520 const int16x8x2_t texels =
523 vqshlq_s16(vld1q_s16(input_ptr + x), b),
524 vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
528 vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
532 for(; x < window_end_x; ++x)
534 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
543 const auto input_ptr =
reinterpret_cast<const int16_t *
>(
input.ptr());
544 const auto output_ptr =
reinterpret_cast<int8_t *
>(output.ptr());
546 int x = window_start_x;
547 for(; x <= (window_end_x - window_step_x); x += window_step_x)
549 const int16x8x2_t texels =
552 vshlq_s16(vld1q_s16(input_ptr + x), b),
553 vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
557 vst1q_s8(output_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
561 for(; x < window_end_x; ++x)
563 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
572 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
579 const auto input_ptr =
reinterpret_cast<const int16_t *
>(
input.ptr());
580 const auto output_ptr =
reinterpret_cast<uint8_t *
>(output.ptr());
582 int x = window_start_x;
583 for(; x <= (window_end_x - window_step_x); x += window_step_x)
585 const int16x8x2_t texels =
588 vqshlq_s16(vld1q_s16(input_ptr + x), b),
589 vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
593 vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
597 for(; x < window_end_x; ++x)
599 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
608 const auto input_ptr =
reinterpret_cast<const int16_t *
>(
input.ptr());
609 const auto output_ptr =
reinterpret_cast<uint8_t *
>(output.ptr());
611 int x = window_start_x;
612 for(; x <= (window_end_x - window_step_x); x += window_step_x)
614 const int16x8x2_t texels =
617 vshlq_s16(vld1q_s16(input_ptr + x), b),
618 vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
622 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
623 vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
627 for(; x < window_end_x; ++x)
629 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
638 const int32x4_t b = vdupq_n_s32(_shift);
643 const auto input_ptr =
reinterpret_cast<const int16_t *
>(
input.ptr());
644 const auto output_ptr =
reinterpret_cast<int32_t *
>(output.ptr());
646 int x = window_start_x;
647 for(; x <= (window_end_x - window_step_x); x += window_step_x)
649 const int16x8x2_t texels =
652 vld1q_s16(input_ptr + x),
653 vld1q_s16(input_ptr + x + 8)
657 const int32x4x4_t texels_s32 =
660 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
661 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
662 vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
663 vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
667 vst1q_s32(output_ptr + x, texels_s32.val[0]);
668 vst1q_s32(output_ptr + x + 4, texels_s32.val[1]);
669 vst1q_s32(output_ptr + x + 8, texels_s32.val[2]);
670 vst1q_s32(output_ptr + x + 12, texels_s32.val[3]);
674 for(; x < window_end_x; ++x)
676 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
693 const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));
700 const auto input_ptr =
reinterpret_cast<const uint16_t *
>(
input.ptr());
701 const auto output_ptr =
reinterpret_cast<uint8_t *
>(output.ptr());
703 int x = window_start_x;
704 for(; x <= (window_end_x - window_step_x); x += window_step_x)
706 const uint16x8x2_t texels =
709 vqshlq_u16(vld1q_u16(input_ptr + x), b),
710 vqshlq_u16(vld1q_u16(input_ptr + x + 8), b)
714 vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
718 for(; x < window_end_x; ++x)
720 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
729 const auto input_ptr =
reinterpret_cast<const uint16_t *
>(
input.ptr());
730 const auto output_ptr =
reinterpret_cast<uint8_t *
>(output.ptr());
732 int x = window_start_x;
733 for(; x <= (window_end_x - window_step_x); x += window_step_x)
735 const uint16x8x2_t texels =
738 vshlq_u16(vld1q_u16(input_ptr + x), b),
739 vshlq_u16(vld1q_u16(input_ptr + x + 8), b)
743 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
747 for(; x < window_end_x; ++x)
749 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
759 const int32x4_t b = vdupq_n_s32(_shift);
764 const auto input_ptr =
reinterpret_cast<const uint16_t *
>(
input.ptr());
765 const auto output_ptr =
reinterpret_cast<uint32_t *
>(output.ptr());
767 int x = window_start_x;
768 for(; x <= (window_end_x - window_step_x); x += window_step_x)
770 const uint16x8x2_t texels =
773 vld1q_u16(input_ptr + x),
774 vld1q_u16(input_ptr + x + 8)
778 vst1q_u32(output_ptr + x, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
779 vst1q_u32(output_ptr + x + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
780 vst1q_u32(output_ptr + x + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
781 vst1q_u32(output_ptr + x + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
784 for(; x < window_end_x; ++x)
786 *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) << _shift);
798 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) 807 const auto input_ptr =
reinterpret_cast<const bfloat16 *
>(
input.ptr());
808 const auto output_ptr =
reinterpret_cast<float *
>(output.ptr());
810 int x = window_start_x;
811 for(; x <= (window_end_x - window_step_x); x += window_step_x)
813 const uint16x8x2_t texels =
816 vld1q_u16(reinterpret_cast<uint16_t *>(
input.ptr())),
817 vld1q_u16(reinterpret_cast<uint16_t *>(
input.ptr()) + 8)
821 vst1q_f32(reinterpret_cast<float *>(output.ptr()),
822 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
823 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 4,
824 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
825 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 8,
826 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
827 vst1q_f32(reinterpret_cast<float *>(output.ptr()) + 12,
828 vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
831 for(; x < window_end_x; ++x)
833 *(output_ptr + x) =
float(*(input_ptr + x));
844 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 850 const float16_t scale_s = 1 << _shift;
851 const float16x8_t
scale = vdupq_n_f16(scale_s);
856 const auto input_ptr =
reinterpret_cast<const float16_t *
>(
input.ptr());
857 const auto output_ptr =
reinterpret_cast<int8_t *
>(output.ptr());
859 int x = window_start_x;
860 for(; x <= (window_end_x - window_step_x); x += window_step_x)
862 const float16x8x2_t texels =
865 vmulq_f16(vld1q_f16(input_ptr + x), scale),
866 vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
874 for(; x < window_end_x; ++x)
876 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
885 const float16_t scale_s = 1 << _shift;
886 const float16x8_t scale = vdupq_n_f16(scale_s);
891 const auto input_ptr =
reinterpret_cast<const float16_t *
>(
input.ptr());
892 const auto output_ptr =
reinterpret_cast<uint8_t *
>(output.ptr());
894 int x = window_start_x;
895 for(; x <= (window_end_x - window_step_x); x += window_step_x)
897 const float16x8x2_t texels =
900 vmulq_f16(vld1q_f16(input_ptr + x), scale),
901 vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
909 for(; x < window_end_x; ++x)
911 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
920 const float scale_s = 1 << _shift;
921 const float32x4_t scale = vdupq_n_f32(scale_s);
926 const auto input_ptr =
reinterpret_cast<const float16_t *
>(
input.ptr());
927 const auto output_ptr =
reinterpret_cast<float *
>(output.ptr());
929 int x = window_start_x;
930 for(; x <= (window_end_x - window_step_x); x += window_step_x)
932 const float16x8x2_t texels =
935 vld1q_f16(input_ptr + x),
936 vld1q_f16(input_ptr + x + 8)
939 vst1q_f32(output_ptr + x, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale));
940 vst1q_f32(output_ptr + x + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale));
941 vst1q_f32(output_ptr + x + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale));
942 vst1q_f32(output_ptr + x + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale));
946 for(; x < window_end_x; ++x)
948 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
956 const float scale_s = 1 << _shift;
957 const float32x4_t scale = vdupq_n_f32(scale_s);
962 const auto input_ptr =
reinterpret_cast<const float16_t *
>(
input.ptr());
963 const auto output_ptr =
reinterpret_cast<int32_t *
>(output.ptr());
965 int x = window_start_x;
966 for(; x <= (window_end_x - window_step_x); x += window_step_x)
968 const float16x8x2_t texels =
971 vld1q_f16(input_ptr + x),
972 vld1q_f16(input_ptr + x + 8)
976 vst1q_s32(output_ptr + x, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale)));
977 vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale)));
978 vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale)));
979 vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale)));
983 for(; x < window_end_x; ++x)
985 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
999 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1002 const float scale_s = 1.f / (1 << _shift);
1003 const float32x4_t scale = vdupq_n_f32(scale_s);
1008 const auto input_ptr =
reinterpret_cast<const float *
>(
input.ptr());
1009 const auto output_ptr =
reinterpret_cast<float16_t *
>(output.ptr());
1011 int x = window_start_x;
1012 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1014 const float32x4x4_t texels =
1017 vmulq_f32(vld1q_f32(input_ptr + x), scale),
1018 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1019 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1020 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale)
1024 vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
1025 vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
1029 for(; x < window_end_x; ++x)
1031 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
1038 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) 1044 const auto input_ptr =
reinterpret_cast<const float *
>(
input.ptr());
1045 const auto output_ptr =
reinterpret_cast<bfloat16 *
>(output.ptr());
1047 int x = window_start_x;
1048 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1050 wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(
input.ptr()),
1051 reinterpret_cast<uint16_t *>(output.ptr()));
1052 wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(
input.ptr()) + 8,
1053 reinterpret_cast<uint16_t *>(output.ptr()) + 8);
1056 for(; x < window_end_x; ++x)
1058 *(output_ptr + x) = *(input_ptr + x);
1067 const float scale_s = 1.f / (1 << _shift);
1068 const float32x4_t scale = vdupq_n_f32(scale_s);
1073 const auto input_ptr =
reinterpret_cast<const float *
>(
input.ptr());
1074 const auto output_ptr =
reinterpret_cast<int32_t *
>(output.ptr());
1076 int x = window_start_x;
1077 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1079 const float32x4x4_t texels =
1082 vmulq_f32(vld1q_f32(input_ptr + x), scale),
1083 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1084 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1085 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1089 vst1q_s32(output_ptr + x, vcvtq_s32_f32(texels.val[0]));
1090 vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
1091 vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
1092 vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
1096 for(; x < window_end_x; ++x)
1098 *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
1107 const float scale_s = 1.f / (1 << _shift);
1108 const float32x4_t scale = vdupq_n_f32(scale_s);
1113 const auto input_ptr =
reinterpret_cast<const float *
>(
input.ptr());
1114 const auto output_ptr =
reinterpret_cast<uint8_t *
>(output.ptr());
1116 int x = window_start_x;
1117 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1119 const float32x4x4_t texels =
1122 vmulq_f32(vld1q_f32(input_ptr + x), scale),
1123 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1124 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1125 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1129 vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
1130 vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
1134 for(; x < window_end_x; ++x)
1136 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
1144 const float scale_s = 1.f / (1 << _shift);
1145 const float32x4_t scale = vdupq_n_f32(scale_s);
1150 const auto input_ptr =
reinterpret_cast<const float *
>(
input.ptr());
1151 const auto output_ptr =
reinterpret_cast<int8_t *
>(output.ptr());
1153 int x = window_start_x;
1154 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1156 const float32x4x4_t texels =
1159 vmulq_f32(vld1q_f32(input_ptr + x), scale),
1160 vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
1161 vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
1162 vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
1166 vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
1167 vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
1170 for(; x < window_end_x; ++x)
1172 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
1187 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1190 const float scale_s = 1.f / (1 << _shift);
1191 const float32x4_t scale = vdupq_n_f32(scale_s);
1196 const auto input_ptr =
reinterpret_cast<const int32_t *
>(
input.ptr());
1197 const auto output_ptr =
reinterpret_cast<float16_t *
>(output.ptr());
1199 int x = window_start_x;
1200 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1202 const float32x4x4_t texels =
1205 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x)), scale),
1206 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 4)), scale),
1207 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 8)), scale),
1208 vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 12)), scale)
1212 vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
1213 vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
1217 for(; x < window_end_x; ++x)
1219 *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
1228 const int scale_s = 1.f / (1 << _shift);
1229 const int32x4_t scale = vdupq_n_s32(scale_s);
1234 const auto input_ptr =
reinterpret_cast<const int32_t *
>(
input.ptr());
1235 const auto output_ptr =
reinterpret_cast<float *
>(output.ptr());
1237 int x = window_start_x;
1238 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1240 const int32x4x4_t texels =
1243 vmulq_s32(vld1q_s32(input_ptr + x), scale),
1244 vmulq_s32(vld1q_s32(input_ptr + x + 4), scale),
1245 vmulq_s32(vld1q_s32(input_ptr + x + 8), scale),
1246 vmulq_s32(vld1q_s32(input_ptr + x + 12), scale),
1250 vst1q_f32(output_ptr + x, vcvtq_f32_s32(texels.val[0]));
1251 vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
1252 vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
1253 vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
1257 for(; x < window_end_x; ++x)
1259 *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
1267 const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));
1274 const auto input_ptr =
reinterpret_cast<const int32_t *
>(
input.ptr());
1275 const auto output_ptr =
reinterpret_cast<int8_t *
>(output.ptr());
1277 int x = window_start_x;
1278 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1280 const int32x4x4_t texels =
1283 vqshlq_s32(vld1q_s32(input_ptr + x), b),
1284 vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1285 vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1286 vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1289 vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
1290 vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
1294 for(; x < window_end_x; ++x)
1296 *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
1305 const auto input_ptr =
reinterpret_cast<const int32_t *
>(
input.ptr());
1306 const auto output_ptr =
reinterpret_cast<int8_t *
>(output.ptr());
1308 int x = window_start_x;
1309 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1311 const int32x4x4_t texels =
1314 vshlq_s32(vld1q_s32(input_ptr + x), b),
1315 vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1316 vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1317 vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1321 vst1_s8(output_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
1322 vst1_s8(output_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
1326 for(; x < window_end_x; ++x)
1328 *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
1338 const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));
1345 const auto input_ptr =
reinterpret_cast<const int32_t *
>(
input.ptr());
1346 const auto output_ptr =
reinterpret_cast<uint8_t *
>(output.ptr());
1348 int x = window_start_x;
1349 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1351 const int32x4x4_t texels =
1354 vqshlq_s32(vld1q_s32(input_ptr + x), b),
1355 vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1356 vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1357 vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1360 vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
1361 vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
1365 for(; x < window_end_x; ++x)
1367 *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
1376 const auto input_ptr =
reinterpret_cast<const int32_t *
>(
input.ptr());
1377 const auto output_ptr =
reinterpret_cast<uint8_t *
>(output.ptr());
1379 int x = window_start_x;
1380 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1382 const int32x4x4_t texels =
1385 vshlq_s32(vld1q_s32(input_ptr + x), b),
1386 vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
1387 vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
1388 vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
1392 vst1_u8(output_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
1393 vst1_u8(output_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
1397 for(; x < window_end_x; ++x)
1399 *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
uint16x8_t vcvtq_f16_s16(float16x8_t)
const Window & window() const
The maximum window the kernel can be executed on.
float16x8_t vmulq_f16(float16x8_t, float16x8_t)
Brain floating point representation class.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
1 channel, 1 U8 per channel
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Describe one of the image's dimensions with a start, end and step.
1 channel, 1 U16 per channel
1 channel, 1 F16 per channel
1 channel, 1 S32 per channel
16-bit brain floating-point number
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
1 channel, 1 U32 per channel
quantized, asymmetric fixed-point 8-bit number unsigned
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
1 channel, 1 S16 per channel
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda function for each element.
int16x8_t vcvtq_s16_f16(float16x8_t)
quantized, asymmetric fixed-point 8-bit number signed
constexpr int end() const
Return the end of the dimension.
Iterator updated by execute_window_loop for each window element.
constexpr int start() const
Return the start of the dimension.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
constexpr const Dimension & x() const
Alias to access the first dimension of the window.