// Multiply-accumulate over 16 unsigned 8-bit lanes: each lane becomes vo + lane * vs,
// computed in float32 and narrowed back to 8 bits with saturation.
// vs and vo hold four floats each; the same vs/vo vectors are applied to every group of four lanes.

// Split the 16-lane input into its low and high 8-lane halves.
29 const uint8x8_t vd_low = vget_low_u8(vd);
30 const uint8x8_t vd_high = vget_high_u8(vd);
// Zero-extend each half from 8-bit to 16-bit lanes.
31 uint16x8_t vd_low_u16x8 = vmovl_u8(vd_low);
32 uint16x8_t vd_high_u16x8 = vmovl_u8(vd_high);
// Zero-extend again to 32 bits, giving four 4-lane vectors A..D in ascending lane order.
34 uint32x4_t A_u32x4 = vmovl_u16(vget_low_u16(vd_low_u16x8));
35 uint32x4_t B_u32x4 = vmovl_u16(vget_high_u16(vd_low_u16x8));
36 uint32x4_t C_u32x4 = vmovl_u16(vget_low_u16(vd_high_u16x8));
37 uint32x4_t D_u32x4 = vmovl_u16(vget_high_u16(vd_high_u16x8));
// Convert the unsigned 32-bit lanes to float32.
39 float32x4_t A_f32x4 = vcvtq_f32_u32(A_u32x4);
40 float32x4_t B_f32x4 = vcvtq_f32_u32(B_u32x4);
41 float32x4_t C_f32x4 = vcvtq_f32_u32(C_u32x4);
42 float32x4_t D_f32x4 = vcvtq_f32_u32(D_u32x4);
// vmlaq_f32(acc, x, y) yields acc + x * y, so vo is the addend and vs the multiplier.
44 A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
45 B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
46 C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
47 D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
// Convert back to unsigned 32-bit integers; vcvtq_u32_f32 rounds toward zero (truncates).
49 A_u32x4 = vcvtq_u32_f32(A_f32x4);
50 B_u32x4 = vcvtq_u32_f32(B_f32x4);
51 C_u32x4 = vcvtq_u32_f32(C_f32x4);
52 D_u32x4 = vcvtq_u32_f32(D_f32x4);
// Saturating narrow from 32 to 16 bits and rebuild the two 8-lane halves.
54 vd_low_u16x8 = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4));
55 vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4));
// Saturating narrow from 16 to 8 bits and join the halves into the 16-lane result.
57 return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
// Multiply-accumulate over 16 signed 8-bit lanes: each lane becomes vo + lane * vs,
// computed in float32 and narrowed back to 8 bits with saturation.
// vs and vo hold four floats each; the same vs/vo vectors are applied to every group of four lanes.

// Split the 16-lane input into its low and high 8-lane halves.
62 const int8x8_t vd_low = vget_low_s8(vd);
63 const int8x8_t vd_high = vget_high_s8(vd);
// Sign-extend each half from 8-bit to 16-bit lanes.
64 int16x8_t vd_low_s16x8 = vmovl_s8(vd_low);
65 int16x8_t vd_high_s16x8 = vmovl_s8(vd_high);
// Sign-extend again to 32 bits, giving four 4-lane vectors A..D in ascending lane order.
67 int32x4_t A_s32x4 = vmovl_s16(vget_low_s16(vd_low_s16x8));
68 int32x4_t B_s32x4 = vmovl_s16(vget_high_s16(vd_low_s16x8));
69 int32x4_t C_s32x4 = vmovl_s16(vget_low_s16(vd_high_s16x8));
70 int32x4_t D_s32x4 = vmovl_s16(vget_high_s16(vd_high_s16x8));
// Convert the signed 32-bit lanes to float32.
72 float32x4_t A_f32x4 = vcvtq_f32_s32(A_s32x4);
73 float32x4_t B_f32x4 = vcvtq_f32_s32(B_s32x4);
74 float32x4_t C_f32x4 = vcvtq_f32_s32(C_s32x4);
75 float32x4_t D_f32x4 = vcvtq_f32_s32(D_s32x4);
// vmlaq_f32(acc, x, y) yields acc + x * y, so vo is the addend and vs the multiplier.
77 A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
78 B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
79 C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
80 D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
// Convert back to signed 32-bit integers; vcvtq_s32_f32 rounds toward zero (truncates).
82 A_s32x4 = vcvtq_s32_f32(A_f32x4);
83 B_s32x4 = vcvtq_s32_f32(B_f32x4);
84 C_s32x4 = vcvtq_s32_f32(C_f32x4);
85 D_s32x4 = vcvtq_s32_f32(D_f32x4);
// Saturating narrow from 32 to 16 bits and rebuild the two 8-lane halves.
87 vd_low_s16x8 = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4));
88 vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4));
// Saturating narrow from 16 to 8 bits and join the halves into the 16-lane result.
90 return vcombine_s8(vqmovn_s16(vd_low_s16x8), vqmovn_s16(vd_high_s16x8));
Copyright (c) 2017-2021 Arm Limited.
uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo)
Perform a multiply-accumulate on all 16 components of a QASYMM8 vector.
int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector.
int8x16_t qasymm8x16_signed_t
8-bit quantized signed asymmetric vector with 16 elements.
uint8x16_t qasymm8x16_t
8-bit quantized asymmetric vector with 16 elements.