24 #ifndef ARM_COMPUTE_SVEASYMM_H
25 #define ARM_COMPUTE_SVEASYMM_H
27 #if defined(ARM_COMPUTE_ENABLE_SVE2)
45 svuint8_t svmla_qasymm8_z(svbool_t pg, svuint8_t vd, svfloat32_t vs, svfloat32_t vo);
58 svint8_t svmla_qasymm8_signed_z(svbool_t pg, svint8_t vd, svfloat32_t vs, svfloat32_t vo);
69 inline svfloat32x4_t svdequantize_z(svbool_t pg,
const svuint8_t &qv,
float scale, int32_t
offset)
71 const auto voffset = svdup_n_s32(
offset);
72 const auto vscale = svdup_n_f32(
scale);
73 const svfloat32x4_t vdequantized_input = svcreate4_f32(
75 svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)),
78 svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)),
81 svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)),
84 svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)),
86 return vdequantized_input;
97 inline svfloat32x4_t svdequantize_z(svbool_t pg,
const svuint8_t &qv,
const UniformQuantizationInfo &qi)
99 return svdequantize_z(pg, qv, qi.scale, qi.offset);
111 inline svfloat32x4_t svdequantize_z(svbool_t pg,
const svint8_t &qv,
float scale, int32_t
offset)
113 const auto voffset = svdup_n_s32(
offset);
114 const auto vscale = svdup_n_f32(
scale);
115 const svfloat32x4_t vdequantized_input = svcreate4_f32(
116 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale),
117 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale),
118 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale),
119 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale));
121 return vdequantized_input;
132 inline svfloat32x4_t svdequantize_z(svbool_t pg,
const svint8_t &qv,
const UniformQuantizationInfo &qi)
134 return svdequantize_z(pg, qv, qi.scale, qi.offset);
145 inline svfloat32x4_t svdequantize_z(svbool_t pg,
const svint8_t &qv,
const svfloat32x4_t vscale)
147 const svfloat32x4_t vdequantized_input =
148 svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)),
149 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)),
150 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)),
151 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3)));
153 return vdequantized_input;
163 inline svfloat32x4_t svdequantize_z(svbool_t pg,
const svint8_t &qv,
float scale)
165 const auto vscale = svdup_n_f32(
scale);
166 const svfloat32x4_t vdequantized_input =
167 svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale),
168 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale),
169 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale),
170 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale));
171 return vdequantized_input;
182 inline svuint8_t svquantize_z(svbool_t pg,
const svfloat32x4_t qv,
const UniformQuantizationInfo &qi)
184 const float scale = qi.scale;
185 const int offset = qi.offset;
186 const auto voffset = svdup_n_f32(
offset);
187 const auto vinvscale = svdup_n_f32(1.f /
scale);
189 const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 0), vinvscale));
190 const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 1), vinvscale));
191 const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 2), vinvscale));
192 const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 3), vinvscale));
194 const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
195 const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
197 return svqxtnt_u16(svqxtnb_u16(pa), pb);
208 inline svint8_t svquantize_signed_z(svbool_t pg,
const svfloat32x4_t qv,
const UniformQuantizationInfo &qi)
210 const float scale = qi.scale;
211 const int offset = qi.offset;
212 const auto voffset = svdup_n_f32(
offset);
213 const auto vinvscale = svdup_n_f32(1.f /
scale);
214 const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 0), vinvscale));
215 const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 1), vinvscale));
216 const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 2), vinvscale));
217 const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 3), vinvscale));
219 const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
220 const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
222 return svqxtnt_s16(svqxtnb_s16(pa), pb);
233 inline svuint16x2_t svquantize_qasymm16_z(svbool_t pg,
const svfloat32x4_t qv,
const UniformQuantizationInfo &qi)
235 const float scale = qi.scale;
236 const int offset = qi.offset;
237 const auto voffset = svdup_n_f32(
offset);
238 const auto vinvscale = svdup_n_f32(1.f /
scale);
240 const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 0), vinvscale));
241 const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 1), vinvscale));
242 const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 2), vinvscale));
243 const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffset, svget4_f32(qv, 3), vinvscale));
245 const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
246 const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
248 return svcreate2_u16(pa, pb);
#endif // ARM_COMPUTE_SVEASYMM_H