29 template <RoundingPolicy round_policy>
// NOTE(review): the function signature and the round_policy /
// `#if __aarch64__` condition lines are not visible in this chunk
// (original lines 30-32, 37, 42, 52-55, 60-62, 67-69, 74, 76, 82, 85
// are absent). Comments below describe only the visible statements.
//
// Visible pipeline: widen a uint8x16_t `vd` to four uint32x4_t quadrants,
// convert to f32, apply a per-lane affine transform (scale `vs`, offset
// `vo`), convert back to u32 with a compile-time-selected rounding mode,
// then saturate-narrow back to uint8x16_t.

// Split the 16 u8 lanes into low/high halves and zero-extend to u16.
33 const uint8x8_t vd_low = vget_low_u8(vd);
34 const uint8x8_t vd_high = vget_high_u8(vd);
35 uint16x8_t vd_low_u16x8 = vmovl_u8(vd_low);
36 uint16x8_t vd_high_u16x8 = vmovl_u8(vd_high);

// Zero-extend again to four u32x4 quadrants (A..D cover lanes 0-3,
// 4-7, 8-11, 12-15 respectively).
38 uint32x4_t A_u32x4 = vmovl_u16(vget_low_u16(vd_low_u16x8));
39 uint32x4_t B_u32x4 = vmovl_u16(vget_high_u16(vd_low_u16x8));
40 uint32x4_t C_u32x4 = vmovl_u16(vget_low_u16(vd_high_u16x8));
41 uint32x4_t D_u32x4 = vmovl_u16(vget_high_u16(vd_high_u16x8));

// Convert each quadrant to single-precision float.
43 float32x4_t A_f32x4 = vcvtq_f32_u32(A_u32x4);
44 float32x4_t B_f32x4 = vcvtq_f32_u32(B_u32x4);
45 float32x4_t C_f32x4 = vcvtq_f32_u32(C_u32x4);
46 float32x4_t D_f32x4 = vcvtq_f32_u32(D_u32x4);

// vmlaq_f32(a, b, c) computes a + b * c per lane, so each lane becomes
// vo + x * vs (scale then offset).
48 A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
49 B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
50 C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
51 D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);

// f32 -> u32, round to nearest with ties to even (AArch64-only
// intrinsic). Presumably selected when round_policy requests
// nearest-even -- the selecting branch is not visible here; confirm.
56 A_u32x4 = vcvtnq_u32_f32(A_f32x4);
57 B_u32x4 = vcvtnq_u32_f32(B_f32x4);
58 C_u32x4 = vcvtnq_u32_f32(C_f32x4);
59 D_u32x4 = vcvtnq_u32_f32(D_f32x4);

// f32 -> u32, round to nearest with ties away from zero (AArch64-only).
63 A_u32x4 = vcvtaq_u32_f32(A_f32x4);
64 B_u32x4 = vcvtaq_u32_f32(B_f32x4);
65 C_u32x4 = vcvtaq_u32_f32(C_f32x4);
66 D_u32x4 = vcvtaq_u32_f32(D_f32x4);

// f32 -> u32, truncate toward zero (default NEON conversion).
70 A_u32x4 = vcvtq_u32_f32(A_f32x4);
71 B_u32x4 = vcvtq_u32_f32(B_f32x4);
72 C_u32x4 = vcvtq_u32_f32(C_f32x4);
73 D_u32x4 = vcvtq_u32_f32(D_f32x4);

75 #else // #if __aarch64__
// ARMv7 fallback: only the truncating conversion exists, so it is used
// regardless of round_policy. NOTE(review): this silently changes the
// rounding behavior relative to AArch64 -- confirm that is intended.
77 A_u32x4 = vcvtq_u32_f32(A_f32x4);
78 B_u32x4 = vcvtq_u32_f32(B_f32x4);
79 C_u32x4 = vcvtq_u32_f32(C_f32x4);
80 D_u32x4 = vcvtq_u32_f32(D_f32x4);
81 #endif // #if __aarch64__

// Saturating narrow u32 -> u16 -> u8 and reassemble the 16-lane vector.
// vqmovn clamps out-of-range lanes instead of wrapping.
83 vd_low_u16x8 = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4));
84 vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4));

86 return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
89 template <RoundingPolicy round_policy>
// NOTE(review): as with the unsigned variant above in the file, the
// function signature and the round_policy / `#if __aarch64__` condition
// lines are not visible in this chunk (original lines 90-92, 97, 102,
// 112-114, 119-121, 126-128, 133, 135, 141-142, 145 are absent).
// Comments below describe only the visible statements.
//
// Visible pipeline: sign-extend an int8x16_t `vd` to four int32x4_t
// quadrants, convert to f32, apply scale `vs` and offset `vo`, convert
// back to s32 with a compile-time-selected rounding mode, then
// saturate-narrow back to int8x16_t.

// Split the 16 s8 lanes into low/high halves and sign-extend to s16.
93 const int8x8_t vd_low = vget_low_s8(vd);
94 const int8x8_t vd_high = vget_high_s8(vd);
95 int16x8_t vd_low_s16x8 = vmovl_s8(vd_low);
96 int16x8_t vd_high_s16x8 = vmovl_s8(vd_high);

// Sign-extend again to four s32x4 quadrants (A..D cover lanes 0-3,
// 4-7, 8-11, 12-15 respectively).
98 int32x4_t A_s32x4 = vmovl_s16(vget_low_s16(vd_low_s16x8));
99 int32x4_t B_s32x4 = vmovl_s16(vget_high_s16(vd_low_s16x8));
100 int32x4_t C_s32x4 = vmovl_s16(vget_low_s16(vd_high_s16x8));
101 int32x4_t D_s32x4 = vmovl_s16(vget_high_s16(vd_high_s16x8));

// Convert each quadrant to single-precision float.
103 float32x4_t A_f32x4 = vcvtq_f32_s32(A_s32x4);
104 float32x4_t B_f32x4 = vcvtq_f32_s32(B_s32x4);
105 float32x4_t C_f32x4 = vcvtq_f32_s32(C_s32x4);
106 float32x4_t D_f32x4 = vcvtq_f32_s32(D_s32x4);

// vmlaq_f32(a, b, c) computes a + b * c per lane, so each lane becomes
// vo + x * vs (scale then offset).
108 A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
109 B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
110 C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
111 D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);

// f32 -> s32, round to nearest with ties to even (AArch64-only
// intrinsic). Presumably selected when round_policy requests
// nearest-even -- the selecting branch is not visible here; confirm.
115 A_s32x4 = vcvtnq_s32_f32(A_f32x4);
116 B_s32x4 = vcvtnq_s32_f32(B_f32x4);
117 C_s32x4 = vcvtnq_s32_f32(C_f32x4);
118 D_s32x4 = vcvtnq_s32_f32(D_f32x4);

// f32 -> s32, round to nearest with ties away from zero (AArch64-only).
122 A_s32x4 = vcvtaq_s32_f32(A_f32x4);
123 B_s32x4 = vcvtaq_s32_f32(B_f32x4);
124 C_s32x4 = vcvtaq_s32_f32(C_f32x4);
125 D_s32x4 = vcvtaq_s32_f32(D_f32x4);

// f32 -> s32, truncate toward zero (default NEON conversion).
129 A_s32x4 = vcvtq_s32_f32(A_f32x4);
130 B_s32x4 = vcvtq_s32_f32(B_f32x4);
131 C_s32x4 = vcvtq_s32_f32(C_f32x4);
132 D_s32x4 = vcvtq_s32_f32(D_f32x4);

134 #else // #if __aarch64__
// ARMv7 fallback: only the truncating conversion exists, so it is used
// regardless of round_policy. NOTE(review): this silently changes the
// rounding behavior relative to AArch64 -- confirm that is intended.
136 A_s32x4 = vcvtq_s32_f32(A_f32x4);
137 B_s32x4 = vcvtq_s32_f32(B_f32x4);
138 C_s32x4 = vcvtq_s32_f32(C_f32x4);
139 D_s32x4 = vcvtq_s32_f32(D_f32x4);
140 #endif // #if __aarch64__

// Saturating narrow s32 -> s16 -> s8 and reassemble the 16-lane vector.
// vqmovn clamps out-of-range lanes instead of wrapping.
143 vd_low_s16x8 = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4));
144 vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4));

146 return vcombine_s8(vqmovn_s16(vd_low_s16x8), vqmovn_s16(vd_high_s16x8));