24 #ifndef ARM_COMPUTE_NEASYMM_H
25 #define ARM_COMPUTE_NEASYMM_H
// NOTE(review): two orphaned template headers — the function definitions they
// introduce are not visible in this extract. Both default the rounding policy
// to TO_ZERO; confirm the associated functions in the full file.
56 template <RoundingPolicy round_policy = RoundingPolicy::TO_ZERO>
69 template <RoundingPolicy round_policy = RoundingPolicy::TO_ZERO>
// --- Fragment: finalize_quantization() -> unsigned 8-bit ---
// Requantizes four int32x4 accumulators with a fixed-point multiply, adds the
// output offset, clamps at zero, then saturating-narrows to uint8x16.
// NOTE(review): the signature line, braces, and conditional guards are missing
// from this extract — the visible lines are NOT contiguous in the original.
85 int result_fixedpoint_multiplier,
87 int32x4_t result_offset_after_shift_s32,
// Lower clamp bound for the unsigned output.
92 const static int32x4_t zero_s32 = vdupq_n_s32(0);
// Negative-shift path: pre-scale by 2^(-result_shift) before the saturating
// rounding doubling multiply-high so no fractional bits are lost.
96 in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
97 in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
98 in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
99 in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));
// Fixed-point multiply (vqrdmulhq: saturating rounding doubling high half).
101 in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
102 in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
103 in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
104 in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
// Presumably the other (non-negative result_shift) branch of the same
// multiply — the guarding condition is not visible here; TODO confirm.
109 in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
110 in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
111 in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
112 in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
// Add the quantization offset that applies after the shift.
122 in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
123 in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
124 in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
125 in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
// Clamp negatives to zero before narrowing to an unsigned type.
128 in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
129 in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
130 in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
131 in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
// Saturating narrow: 4x int32x4 -> 2x int16x8.
134 const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
135 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Saturating narrow to unsigned: 2x int16x8 -> uint8x16.
138 uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
// Bounded-ReLU style clamp to [min_u8, max_u8]; the enclosing condition
// (likely is_bounded_relu) is not visible in this extract.
142 out_u8 = vmaxq_u8(out_u8, min_u8);
143 out_u8 = vminq_u8(out_u8, max_u8);
// --- Fragment: finalize_quantization() -> signed 8-bit ---
// Same requantization pipeline as the unsigned variant above, but narrows to
// int8x16 (vqmovn_s16 instead of vqmovun_s16) and applies no zero clamp.
// NOTE(review): signature line, braces, and guard lines are missing here.
162 int result_fixedpoint_multiplier,
163 int32_t result_shift,
164 int32x4_t result_offset_after_shift_s32,
167 bool is_bounded_relu)
// Negative shift: pre-scale by 2^(-result_shift) before the fixed-point multiply.
169 if (result_shift < 0)
171 in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
172 in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
173 in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
174 in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));
// Saturating rounding doubling multiply-high by the fixed-point multiplier.
176 in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
177 in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
178 in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
179 in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
// Presumably the else-branch (result_shift >= 0) of the same multiply —
// the branch structure is not visible; TODO confirm against the full file.
184 in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
185 in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
186 in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
187 in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
// Add the post-shift quantization offset.
197 in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
198 in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
199 in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
200 in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
// Saturating narrow: 4x int32x4 -> 2x int16x8.
203 const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
204 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
// Saturating narrow (signed): 2x int16x8 -> int8x16.
207 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
// Bounded-ReLU clamp to [min_s8, max_s8]; enclosing is_bounded_relu check
// is not visible in this extract.
211 out_s8 = vmaxq_s8(out_s8, min_s8);
212 out_s8 = vminq_s8(out_s8, max_s8);
// --- Fragment: per-channel (symmetric) finalize_quantization -> int8x16 ---
// Unlike the scalar-multiplier variants above, multiplier and shift are full
// int32x4x4_t vectors (one value per lane/channel). Both the negative-shift
// and non-negative-shift results are computed unconditionally, then selected
// per-lane with a bit-select mask — branchless per-channel requantization.
// NOTE(review): signature line, braces, and #if guards are missing here; the
// two mask initializations below (vcltzq vs vcltq) are presumably the
// aarch64 / fallback halves of an architecture #ifdef — confirm.
231 const int32x4x4_t &result_fixedpoint_multiplier,
232 const int32x4x4_t &result_shift,
233 const int32x4_t &result_offset_after_shift_s32,
234 const int8x16_t &min_s8,
235 const int8x16_t &max_s8,
236 const bool is_bounded_relu)
// Constant 1, shifted left per-lane to build 2^(-shift) multipliers below.
238 const static int32x4_t one_s32 = vdupq_n_s32(1);
// Result for lanes whose shift is >= 0: plain fixed-point multiply.
241 int32x4x4_t res_shift_gt0 = {
242 vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]),
243 vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]),
244 vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]),
245 vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]),
// Result for lanes whose shift is < 0: pre-scale by 2^(-shift)
// (1 << -shift, built with vshlq/vnegq) then fixed-point multiply.
253 int32x4x4_t res_shift_lt0 = {
254 vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))),
255 vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))),
256 vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))),
257 vmulq_s32(in_s32.val[3], vshlq_s32(one_s32, vnegq_s32(result_shift.val[3]))),
259 res_shift_lt0.val[0] = vqrdmulhq_s32(res_shift_lt0.val[0], result_fixedpoint_multiplier.val[0]);
260 res_shift_lt0.val[1] = vqrdmulhq_s32(res_shift_lt0.val[1], result_fixedpoint_multiplier.val[1]);
261 res_shift_lt0.val[2] = vqrdmulhq_s32(res_shift_lt0.val[2], result_fixedpoint_multiplier.val[2]);
262 res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]);
// Per-lane mask: all-ones where result_shift < 0.
265 const uint32x4x4_t mask_lt0 = {
// vcltzq_s32 variant (compare-less-than-zero; A64 intrinsic).
267 vcltzq_s32(result_shift.val[0]),
268 vcltzq_s32(result_shift.val[1]),
269 vcltzq_s32(result_shift.val[2]),
270 vcltzq_s32(result_shift.val[3]),
// Equivalent vcltq-against-zero variant (portable fallback).
272 vcltq_s32(result_shift.val[0], vdupq_n_s32(0)),
273 vcltq_s32(result_shift.val[1], vdupq_n_s32(0)),
274 vcltq_s32(result_shift.val[2], vdupq_n_s32(0)),
275 vcltq_s32(result_shift.val[3], vdupq_n_s32(0)),
// Select per-lane: lt0 result where shift < 0, gt0 result otherwise.
279 in_s32.val[0] = vbslq_s32(mask_lt0.val[0], res_shift_lt0.val[0], res_shift_gt0.val[0]);
280 in_s32.val[1] = vbslq_s32(mask_lt0.val[1], res_shift_lt0.val[1], res_shift_gt0.val[1]);
281 in_s32.val[2] = vbslq_s32(mask_lt0.val[2], res_shift_lt0.val[2], res_shift_gt0.val[2]);
282 in_s32.val[3] = vbslq_s32(mask_lt0.val[3], res_shift_lt0.val[3], res_shift_gt0.val[3]);
// Add the post-shift quantization offset.
285 in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
286 in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
287 in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
288 in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
// Saturating narrow: 4x int32x4 -> 2x int16x8 -> int8x16.
291 const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
292 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
295 int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
// Bounded-ReLU clamp to [min_s8, max_s8]; enclosing condition not visible.
299 out_s8 = vmaxq_s8(out_s8, min_s8);
300 out_s8 = vminq_s8(out_s8, max_s8);
// --- Fragment: scalar finalize_quantization -> uint8_t ---
// Single-value version: broadcasts in_value into a vector so the same
// vqrdmulhq_n_s32 fixed-point multiply can be used, extracts lane 0 back,
// adds the offset, then clamps to [0, 255] (and optionally [min_u8, max_u8]).
// NOTE(review): signature line and braces are missing from this extract.
319 int result_fixedpoint_multiplier,
320 int32_t result_shift,
321 int32_t result_offset_after_shift_s32,
324 bool is_bounded_relu)
// Broadcast to a vector to reuse the NEON fixed-point multiply.
326 int32x4_t in_s32 = vdupq_n_s32(in_value);
328 if (result_shift < 0)
// Negative shift: pre-scale by 2^(-result_shift), multiply, take lane 0.
330 in_value = vgetq_lane_s32(
331 vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
// Non-negative shift path (the rounding right-shift that presumably follows
// is not visible in this extract).
336 in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
342 in_value += result_offset_after_shift_s32;
// Saturate to the full uint8 range [0, 255].
345 uint8_t out_u8 =
static_cast<uint8_t
>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
// Bounded-ReLU clamp; enclosing is_bounded_relu check not visible.
348 out_u8 =
static_cast<uint8_t
>(std::max(min_u8, std::min(max_u8, out_u8)));
// --- Fragment: scalar finalize_quantization -> int8_t ---
// Signed twin of the scalar uint8 variant above: same broadcast-multiply-
// extract trick, but saturates to [-128, 127].
// NOTE(review): signature line and braces are missing from this extract.
367 int result_fixedpoint_multiplier,
368 int32_t result_shift,
369 int32_t result_offset_after_shift_s32,
372 bool is_bounded_relu)
// Broadcast to a vector to reuse the NEON fixed-point multiply.
374 int32x4_t in_s32 = vdupq_n_s32(in_value);
376 if (result_shift < 0)
// Negative shift: pre-scale by 2^(-result_shift), multiply, take lane 0.
378 in_value = vgetq_lane_s32(
379 vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
// Non-negative shift path (subsequent rounding shift not visible here).
384 in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
391 in_value += result_offset_after_shift_s32;
// Saturate to the full int8 range [-128, 127].
394 int8_t out_s8 =
static_cast<int8_t
>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
// Bounded-ReLU clamp; enclosing is_bounded_relu check not visible.
397 out_s8 =
static_cast<int8_t
>(std::max(min_s8, std::min(max_s8, out_s8)));
// --- Fragment: vdequantize(uint8x8) -> float32x4x2 ---
// Dequantizes 8 asymmetric-quantized uint8 values: widen u8 -> u16 -> s32,
// subtract the offset, convert to float, multiply by the scale.
// NOTE(review): signature line and the trailing vscale operands of each
// vmulq_f32 are missing from this extract.
414 const int32x4_t voffset = vdupq_n_s32(
offset);
415 const float32x4_t vscale = vdupq_n_f32(
scale);
416 const float32x4x2_t vdequantized_input = {{
417 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)),
419 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)),
422 return vdequantized_input;
// --- Fragment: vdequantize(int8x8) -> float32x4x2 ---
// Signed twin of the uint8x8 overload: widen s8 -> s16 -> s32, subtract
// offset, convert, scale. NOTE(review): signature line missing.
436 const int32x4_t voffset = vdupq_n_s32(
offset);
437 const float32x4_t vscale = vdupq_n_f32(
scale);
438 const float32x4x2_t vdequantized_input = {{
439 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
440 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
442 return vdequantized_input;
// --- Fragment: vdequantize(uint8x16) -> float32x4x4 ---
// 16-lane version: processes low/high halves of the u8x16 input, each
// widened u8 -> u16 -> s32, offset-subtracted, converted, and scaled.
// NOTE(review): signature line and the vscale operand lines of each
// vmulq_f32 are missing from this extract.
456 const int32x4_t voffset = vdupq_n_s32(
offset);
457 const float32x4_t vscale = vdupq_n_f32(
scale);
458 const float32x4x4_t vdequantized_input = {{
459 vmulq_f32(vcvtq_f32_s32(
460 vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
462 vmulq_f32(vcvtq_f32_s32(
463 vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
465 vmulq_f32(vcvtq_f32_s32(
466 vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
468 vmulq_f32(vcvtq_f32_s32(
469 vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
472 return vdequantized_input;
// --- Fragment: vdequantize(int8x16) -> float32x4x4 ---
// Signed 16-lane version: widen each quarter s8 -> s16 -> s32, subtract
// offset, convert, scale. NOTE(review): signature line missing.
486 const int32x4_t voffset = vdupq_n_s32(
offset);
487 const float32x4_t vscale = vdupq_n_f32(
scale);
488 const float32x4x4_t vdequantized_input = {{
489 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
490 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
491 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
492 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
494 return vdequantized_input;
// --- Fragment: second vdequantize(uint8x16) overload -> float32x4x4 ---
// Body is identical in shape to the uint8x16 overload above; presumably this
// overload differs only in its (missing) signature — e.g. taking scale and
// offset in a different form. TODO confirm against the full file.
507 const int32x4_t voffset = vdupq_n_s32(
offset);
508 const float32x4_t vscale = vdupq_n_f32(
scale);
509 const float32x4x4_t vdequantized_input = {{
510 vmulq_f32(vcvtq_f32_s32(
511 vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
513 vmulq_f32(vcvtq_f32_s32(
514 vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
516 vmulq_f32(vcvtq_f32_s32(
517 vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
519 vmulq_f32(vcvtq_f32_s32(
520 vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
523 return vdequantized_input;
// --- Fragment: second vdequantize(int8x16) overload -> float32x4x4 ---
// Same body as the int8x16 overload above; the distinguishing signature is
// not visible in this extract. TODO confirm against the full file.
536 const int32x4_t voffset = vdupq_n_s32(
offset);
537 const float32x4_t vscale = vdupq_n_f32(
scale);
538 const float32x4x4_t vdequantized_input = {{
539 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
540 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
541 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
542 vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
544 return vdequantized_input;
// --- Fragment: vdequantize(int8x16, float32x4x4_t vscale) -> float32x4x4 ---
// Per-lane-scale dequantize with no offset (symmetric quantization): each
// s32-widened quarter is multiplied by its own scale vector vscale.val[i].
// NOTE(review): the opening/closing braces are missing from this extract.
554 inline float32x4x4_t
vdequantize(
const int8x16_t &qv,
const float32x4x4_t vscale)
556 const float32x4x4_t vdequantized_input = {{
557 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
558 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
559 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
560 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
562 return vdequantized_input;
// --- Fragment: symmetric vdequantize(int8x16, scale) -> float32x4x4 ---
// No offset subtraction (symmetric quantization): widen each quarter
// s8 -> s16 -> s32, convert, and multiply by the broadcast scale.
// NOTE(review): signature line missing from this extract.
574 const float32x4_t vscale = vdupq_n_f32(
scale);
575 const float32x4x4_t vdequantized_input = {{
576 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
577 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
578 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
579 vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
581 return vdequantized_input;
// --- Fragment: vquantize(float32x4x2) -> uint8x8 ---
// Quantizes 8 floats: qv * (1/scale) + offset via fused multiply-add, then
// convert to int32, saturating-narrow to s16, and unsigned-narrow to u8.
// NOTE(review): signature line missing; the duplicated converts below are
// presumably an #ifdef pair — vcvtnq (round-to-nearest-even, A64) vs vcvtq
// (truncating fallback) — the guard lines are not visible here.
595 const float32x4_t voffset = vdupq_n_f32(
offset);
596 const float32x4_t vinvscale = vdupq_n_f32(1.f /
scale);
597 const int32x4x4_t rf = {{
// Round-to-nearest convert variant.
599 vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
600 vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
// Truncating convert variant.
602 vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
603 vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
// Saturating narrow: s32 -> s16 -> u8 (vqmovun clamps negatives to 0).
606 return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
// --- Fragment: vquantize_signed(float32x4x2) -> int8x8 ---
// Signed twin of the uint8 quantizer above; final narrow uses vqmovn_s16
// (signed saturation) instead of vqmovun_s16.
// NOTE(review): signature line and #ifdef guards missing as above.
620 const float32x4_t voffset = vdupq_n_f32(
offset);
621 const float32x4_t vinvscale = vdupq_n_f32(1.f /
scale);
622 const int32x4x4_t rf = {{
// Round-to-nearest convert variant.
624 vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
625 vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
// Truncating convert variant.
627 vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
628 vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
// Saturating narrow: s32 -> s16 -> s8.
631 return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
// --- Fragment: 16-lane quantize -> uint8x16 ---
// Note the different op order vs the 8-lane versions: multiply by 1/scale,
// convert to int32, THEN add the integer offset (voffset is int32x4 here).
// NOTE(review): signature line and #ifdef guards are missing; the vcvtaq
// (round-to-nearest-away, A64) and vcvtq (truncating) groups are presumably
// the two halves of an architecture #ifdef. The rf -> pa/pb lines may belong
// to more than one function in the original — line numbering jumps 648->664.
636 const int32x4_t voffset = vdupq_n_s32(
offset);
637 const float32x4_t vinvscale = vdupq_n_f32(1.f /
scale);
638 const int32x4x4_t rf = {{
// Round-to-nearest-with-ties-away convert variant.
640 vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
641 vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
642 vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
643 vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
// Truncating convert variant.
645 vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
646 vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
647 vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
648 vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
// Saturating narrow to unsigned and assemble the 16-byte result.
664 const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
665 const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
666 return vcombine_u8(pa, pb);
// --- Fragment: tail of a 16-lane signed quantize -> int8x16 ---
// Saturating narrow of an rf computed earlier (not visible here: line
// numbering jumps from 666 to 679) and assembly of the 16-byte result.
679 const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
680 const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
681 return vcombine_s8(pa, pb);
// --- Fragment: tail of a 16-bit quantize (QASYMM16-style) ---
// Saturating s32 -> u16 narrow of an rf computed earlier (not visible here);
// the final combine/return of pa and pb is also missing from this extract.
694 const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
695 const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
701 #endif // ARM_COMPUTE_NEASYMM_H