#ifndef ARM_COMPUTE_NEASYMM_H
#define ARM_COMPUTE_NEASYMM_H

#include <arm_neon.h>

// NOTE(review): the original header also declares rounding_divide_by_pow2() and the
// qasymm8 typedefs before this point (see the index at the bottom of this page);
// the extraction dropped those lines — confirm against upstream NEAsymm.h.

/** Performs the final quantization step on 16 elements.
 *
 * @param[in,out] in_s32                        32-bit accumulators to quantize (clobbered as scratch).
 * @param[in]     result_fixedpoint_multiplier  Fixed-point multiplier applied with vqrdmulhq_n_s32.
 * @param[in]     result_shift                  Right shift; a negative value means a left shift (multiply by 2^-shift).
 * @param[in]     result_offset_after_shift_s32 Offset added after the shift.
 * @param[in]     min_u8                        ReLU lower bound (only used when @p is_bounded_relu).
 * @param[in]     max_u8                        ReLU upper bound (only used when @p is_bounded_relu).
 * @param[in]     is_bounded_relu               If true, clamp the result to [min_u8, max_u8].
 *
 * @return 16 quantized values.
 */
inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
                                        int          result_fixedpoint_multiplier,
                                        int32_t      result_shift,
                                        int32x4_t    result_offset_after_shift_s32,
                                        uint8x16_t   min_u8,
                                        uint8x16_t   max_u8,
                                        bool         is_bounded_relu)
{
    const static int32x4_t zero_s32 = vdupq_n_s32(0);

    if(result_shift < 0)
    {
        // Negative shift: scale up by 2^(-shift) before the fixed-point multiplication
        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
        in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
        in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));

        // Fixed-point multiplication: saturating rounding doubling multiply-high with scalar
        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
    }
    else
    {
        // Fixed-point multiplication: saturating rounding doubling multiply-high with scalar
        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);

        // Round to the nearest division by a power-of-two using result_shift
        // (rounding_divide_by_pow2 is declared earlier in this header — see index)
        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
        in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
        in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
    }

    // Add the offset terms
    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);

    // Saturate negative values to zero (the output is unsigned)
    in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
    in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
    in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
    in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);

    // Convert S32 to S16 with saturation
    const int16x8x2_t in_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
        }
    };

    // Convert S16 to U8 with saturation
    uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));

    if(is_bounded_relu)
    {
        out_u8 = vmaxq_u8(out_u8, min_u8);
        out_u8 = vminq_u8(out_u8, max_u8);
    }

    return out_u8;
}
/** Performs the final quantization step on 16 signed elements.
 *
 * @param[in,out] in_s32                        32-bit accumulators to quantize (clobbered as scratch).
 * @param[in]     result_fixedpoint_multiplier  Fixed-point multiplier applied with vqrdmulhq_n_s32.
 * @param[in]     result_shift                  Right shift; a negative value means a left shift (multiply by 2^-shift).
 * @param[in]     result_offset_after_shift_s32 Offset added after the shift.
 * @param[in]     min_s8                        ReLU lower bound (only used when @p is_bounded_relu).
 * @param[in]     max_s8                        ReLU upper bound (only used when @p is_bounded_relu).
 * @param[in]     is_bounded_relu               If true, clamp the result to [min_s8, max_s8].
 *
 * @return 16 quantized signed values. Unlike the unsigned overload, negative
 *         values are NOT clamped to zero before narrowing.
 */
inline int8x16_t finalize_quantization(int32x4x4_t &in_s32,
                                       int          result_fixedpoint_multiplier,
                                       int32_t      result_shift,
                                       int32x4_t    result_offset_after_shift_s32,
                                       int8x16_t    min_s8,
                                       int8x16_t    max_s8,
                                       bool         is_bounded_relu)
{
    if(result_shift < 0)
    {
        // Negative shift: scale up by 2^(-shift) before the fixed-point multiplication
        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
        in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
        in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));

        // Fixed-point multiplication: saturating rounding doubling multiply-high with scalar
        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
    }
    else
    {
        // Fixed-point multiplication: saturating rounding doubling multiply-high with scalar
        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
        in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
        in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);

        // Round to the nearest division by a power-of-two using result_shift
        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
        in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
        in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
    }

    // Add the offset terms
    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);

    // Convert S32 to S16 with saturation
    const int16x8x2_t in_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
        }
    };

    // Convert S16 to S8 with saturation
    int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));

    if(is_bounded_relu)
    {
        out_s8 = vmaxq_s8(out_s8, min_s8);
        out_s8 = vminq_s8(out_s8, max_s8);
    }

    return out_s8;
}
/** Performs the final quantization step on 16 elements for symmetric (per-channel) quantization.
 *
 * Both shift branches are computed unconditionally and the per-lane result is
 * selected with a bit-select, because multiplier and shift vary per lane.
 *
 * @param[in,out] in_s32                        32-bit accumulators to quantize (clobbered as scratch).
 * @param[in]     result_fixedpoint_multiplier  Per-lane fixed-point multipliers.
 * @param[in]     result_shift                  Per-lane shifts; negative lanes mean a left shift.
 * @param[in]     result_offset_after_shift_s32 Offset added after the shift.
 * @param[in]     min_s8                        ReLU lower bound (only used when @p is_bounded_relu).
 * @param[in]     max_s8                        ReLU upper bound (only used when @p is_bounded_relu).
 * @param[in]     is_bounded_relu               If true, clamp the result to [min_s8, max_s8].
 *
 * @return 16 quantized signed values.
 */
inline int8x16_t finalize_quantization_symm(int32x4x4_t       &in_s32,
                                            const int32x4x4_t &result_fixedpoint_multiplier,
                                            const int32x4x4_t &result_shift,
                                            const int32x4_t   &result_offset_after_shift_s32,
                                            const int8x16_t   &min_s8,
                                            const int8x16_t   &max_s8,
                                            const bool         is_bounded_relu)
{
    const static int32x4_t one_s32 = vdupq_n_s32(1);

    // Branch for shift >= 0: fixed-point multiply, then rounding right shift
    int32x4x4_t res_shift_gt0 =
    {
        {
            vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]),
            vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]),
            vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]),
            vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]),
        }
    };
    // Round to the nearest division by a power-of-two using result_shift
    res_shift_gt0.val[0] = rounding_divide_by_pow2(res_shift_gt0.val[0], result_shift.val[0]);
    res_shift_gt0.val[1] = rounding_divide_by_pow2(res_shift_gt0.val[1], result_shift.val[1]);
    res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]);
    res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]);

    // Branch for shift < 0: scale up by 2^(-shift), then fixed-point multiply
    int32x4x4_t res_shift_lt0 =
    {
        {
            vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))),
            vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))),
            vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))),
            vmulq_s32(in_s32.val[3], vshlq_s32(one_s32, vnegq_s32(result_shift.val[3]))),
        }
    };
    res_shift_lt0.val[0] = vqrdmulhq_s32(res_shift_lt0.val[0], result_fixedpoint_multiplier.val[0]);
    res_shift_lt0.val[1] = vqrdmulhq_s32(res_shift_lt0.val[1], result_fixedpoint_multiplier.val[1]);
    res_shift_lt0.val[2] = vqrdmulhq_s32(res_shift_lt0.val[2], result_fixedpoint_multiplier.val[2]);
    res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]);

    // Per-lane mask: true where result_shift < 0
    const uint32x4x4_t mask_lt0 =
    {
        {
#ifdef __aarch64__
            vcltzq_s32(result_shift.val[0]),
            vcltzq_s32(result_shift.val[1]),
            vcltzq_s32(result_shift.val[2]),
            vcltzq_s32(result_shift.val[3]),
#else  //__aarch64__
            vcltq_s32(result_shift.val[0], vdupq_n_s32(0)),
            vcltq_s32(result_shift.val[1], vdupq_n_s32(0)),
            vcltq_s32(result_shift.val[2], vdupq_n_s32(0)),
            vcltq_s32(result_shift.val[3], vdupq_n_s32(0)),
#endif //__aarch64__
        }
    };

    // Select the correct branch per lane
    in_s32.val[0] = vbslq_s32(mask_lt0.val[0], res_shift_lt0.val[0], res_shift_gt0.val[0]);
    in_s32.val[1] = vbslq_s32(mask_lt0.val[1], res_shift_lt0.val[1], res_shift_gt0.val[1]);
    in_s32.val[2] = vbslq_s32(mask_lt0.val[2], res_shift_lt0.val[2], res_shift_gt0.val[2]);
    in_s32.val[3] = vbslq_s32(mask_lt0.val[3], res_shift_lt0.val[3], res_shift_gt0.val[3]);

    // Add the offset terms
    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);

    // Convert S32 to S16 with saturation
    const int16x8x2_t in_s16 =
    {
        {
            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
        }
    };

    // Convert S16 to S8 with saturation
    int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));

    if(is_bounded_relu)
    {
        out_s8 = vmaxq_s8(out_s8, min_s8);
        out_s8 = vminq_s8(out_s8, max_s8);
    }

    return out_s8;
}
/** Performs the final quantization step on a single (scalar) value.
 *
 * @param[in] in_value                      Accumulator value to quantize.
 * @param[in] result_fixedpoint_multiplier  Fixed-point multiplier.
 * @param[in] result_shift                  Right shift; a negative value means a left shift.
 * @param[in] result_offset_after_shift_s32 Offset added after the shift.
 * @param[in] min_u8                        ReLU lower bound (only used when @p is_bounded_relu).
 * @param[in] max_u8                        ReLU upper bound (only used when @p is_bounded_relu).
 * @param[in] is_bounded_relu               If true, clamp the result to [min_u8, max_u8].
 *
 * @return The quantized value, saturated to [0, 255].
 */
inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
                                     int32_t result_shift, int32_t result_offset_after_shift_s32,
                                     uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
{
    // Broadcast into a vector so the saturating rounding doubling multiply-high
    // intrinsic can be reused; only lane 0 is read back.
    int32x4_t in_s32 = vdupq_n_s32(in_value);

    if(result_shift < 0)
    {
        // Negative shift: scale up by 2^(-shift), then fixed-point multiply
        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
    }
    else
    {
        // Fixed-point multiplication
        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
        // Round to the nearest division by a power-of-two using result_shift
        in_value = rounding_divide_by_pow2(in_value, result_shift);
    }

    // Add the offset term
    in_value += result_offset_after_shift_s32;

    // Bound the result to the U8 range
    uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
    if(is_bounded_relu)
    {
        out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8)));
    }

    return out_u8;
}
/** Performs the final quantization step on a single (scalar) signed value.
 *
 * @param[in] in_value                      Accumulator value to quantize.
 * @param[in] result_fixedpoint_multiplier  Fixed-point multiplier.
 * @param[in] result_shift                  Right shift; a negative value means a left shift.
 * @param[in] result_offset_after_shift_s32 Offset added after the shift.
 * @param[in] min_s8                        ReLU lower bound (only used when @p is_bounded_relu).
 * @param[in] max_s8                        ReLU upper bound (only used when @p is_bounded_relu).
 * @param[in] is_bounded_relu               If true, clamp the result to [min_s8, max_s8].
 *
 * @return The quantized value, saturated to [-128, 127].
 */
inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
                                    int32_t result_shift, int32_t result_offset_after_shift_s32,
                                    int8_t min_s8, int8_t max_s8, bool is_bounded_relu)
{
    // Broadcast into a vector so the saturating rounding doubling multiply-high
    // intrinsic can be reused; only lane 0 is read back.
    int32x4_t in_s32 = vdupq_n_s32(in_value);

    if(result_shift < 0)
    {
        // Negative shift: scale up by 2^(-shift), then fixed-point multiply
        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
    }
    else
    {
        // Fixed-point multiplication
        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
        // Round to the nearest division by a power-of-two using result_shift
        in_value = rounding_divide_by_pow2(in_value, result_shift);
    }

    // Add the offset term
    in_value += result_offset_after_shift_s32;

    // Bound the result to the S8 range
    int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
    if(is_bounded_relu)
    {
        out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
    }

    return out_s8;
}
/** Dequantize a neon vector holding 8 quantized asymmetric values.
 *
 * @param[in] qv Input values (QASYMM8) to dequantize.
 * @param[in] qi Quantization information (scale and offset) to use.
 *
 * @return Dequantized values as two float32x4 vectors: (q - offset) * scale.
 */
inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
{
    const float         scale   = qi.scale;
    const int32_t       offset  = qi.offset;
    const int32x4_t     voffset = vdupq_n_s32(offset);
    const float32x4_t   vscale  = vdupq_n_f32(scale);
    const float32x4x2_t vdequantized_input =
    {
        {
            // Widen U8 -> U16 -> S32 (reinterpret), subtract offset, convert, scale
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
        }
    };
    return vdequantized_input;
}
/** Dequantize a neon vector holding 8 signed quantized asymmetric values.
 *
 * @param[in] qv Input values (QASYMM8_SIGNED) to dequantize.
 * @param[in] qi Quantization information (scale and offset) to use.
 *
 * @return Dequantized values as two float32x4 vectors: (q - offset) * scale.
 */
inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
{
    const float         scale   = qi.scale;
    const int32_t       offset  = qi.offset;
    const int32x4_t     voffset = vdupq_n_s32(offset);
    const float32x4_t   vscale  = vdupq_n_f32(scale);
    const float32x4x2_t vdequantized_input =
    {
        {
            // Widen S8 -> S16 -> S32, subtract offset, convert, scale
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
        }
    };
    return vdequantized_input;
}
/** Dequantize a neon vector holding 16 quantized asymmetric values.
 *
 * @param[in] qv Input values (QASYMM8) to dequantize.
 * @param[in] qi Quantization information (scale and offset) to use.
 *
 * @return Dequantized values as four float32x4 vectors: (q - offset) * scale.
 */
inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
{
    const float         scale   = qi.scale;
    const int32_t       offset  = qi.offset;
    const int32x4_t     voffset = vdupq_n_s32(offset);
    const float32x4_t   vscale  = vdupq_n_f32(scale);
    const float32x4x4_t vdequantized_input =
    {
        {
            // Widen each quarter U8 -> U16 -> S32 (reinterpret), subtract offset, convert, scale
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
        }
    };
    return vdequantized_input;
}
/** Dequantize a neon vector holding 16 signed quantized asymmetric values.
 *
 * @param[in] qv Input values (QASYMM8_SIGNED) to dequantize.
 * @param[in] qi Quantization information (scale and offset) to use.
 *
 * @return Dequantized values as four float32x4 vectors: (q - offset) * scale.
 */
inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
{
    const float         scale   = qi.scale;
    const int32_t       offset  = qi.offset;
    const int32x4_t     voffset = vdupq_n_s32(offset);
    const float32x4_t   vscale  = vdupq_n_f32(scale);
    const float32x4x4_t vdequantized_input =
    {
        {
            // Widen each quarter S8 -> S16 -> S32, subtract offset, convert, scale
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
        }
    };
    return vdequantized_input;
}
/** Dequantize a neon vector holding 16 quantized asymmetric values, with
 *  scale and offset passed directly.
 *
 * NOTE(review): signature reconstructed — this overload takes raw scale/offset
 * rather than a UniformQuantizationInfo; confirm against upstream NEAsymm.h.
 *
 * @param[in] qv     Input values (QASYMM8) to dequantize.
 * @param[in] scale  Quantization scale.
 * @param[in] offset Quantization offset.
 *
 * @return Dequantized values as four float32x4 vectors: (q - offset) * scale.
 */
inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
{
    const int32x4_t     voffset = vdupq_n_s32(offset);
    const float32x4_t   vscale  = vdupq_n_f32(scale);
    const float32x4x4_t vdequantized_input =
    {
        {
            // Widen each quarter U8 -> U16 -> S32 (reinterpret), subtract offset, convert, scale
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
        }
    };
    return vdequantized_input;
}
/** Dequantize a neon vector holding 16 signed quantized asymmetric values, with
 *  scale and offset passed directly.
 *
 * NOTE(review): signature reconstructed — this overload takes raw scale/offset
 * rather than a UniformQuantizationInfo; confirm against upstream NEAsymm.h.
 *
 * @param[in] qv     Input values (QASYMM8_SIGNED) to dequantize.
 * @param[in] scale  Quantization scale.
 * @param[in] offset Quantization offset.
 *
 * @return Dequantized values as four float32x4 vectors: (q - offset) * scale.
 */
inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset)
{
    const int32x4_t     voffset = vdupq_n_s32(offset);
    const float32x4_t   vscale  = vdupq_n_f32(scale);
    const float32x4x4_t vdequantized_input =
    {
        {
            // Widen each quarter S8 -> S16 -> S32, subtract offset, convert, scale
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
        }
    };
    return vdequantized_input;
}
/** Dequantize a neon vector holding 16 signed quantized values with per-lane scales.
 *
 * No offset is subtracted — this is the symmetric (zero-offset) path with a
 * distinct scale vector per group of four lanes.
 *
 * @param[in] qv     Input values (QSYMM8 per-channel) to dequantize.
 * @param[in] vscale Four scale vectors, one per group of four output floats.
 *
 * @return Dequantized values as four float32x4 vectors: q * scale.
 */
inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale)
{
    const float32x4x4_t vdequantized_input =
    {
        {
            // Widen each quarter S8 -> S16 -> S32, convert, multiply by the matching scale vector
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
        }
    };
    return vdequantized_input;
}
/** Dequantize a neon vector holding 16 signed quantized values with a single scale.
 *
 * No offset is subtracted — symmetric (zero-offset) quantization.
 *
 * NOTE(review): signature reconstructed (scale-only overload); confirm against
 * upstream NEAsymm.h.
 *
 * @param[in] qv    Input values (QSYMM8) to dequantize.
 * @param[in] scale Quantization scale.
 *
 * @return Dequantized values as four float32x4 vectors: q * scale.
 */
inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
{
    const float32x4_t   vscale = vdupq_n_f32(scale);
    const float32x4x4_t vdequantized_input =
    {
        {
            // Widen each quarter S8 -> S16 -> S32, convert, scale
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
        }
    };
    return vdequantized_input;
}
/** Quantize a neon vector holding 8 floating point values.
 *
 * @param[in] qv Input float values to quantize.
 * @param[in] qi Quantization information (scale and offset) to use.
 *
 * @return 8 quantized values (QASYMM8): saturate(round(q / scale + offset)).
 */
inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
{
    const float       scale     = qi.scale;
    const int32_t     offset    = qi.offset;
    const float32x4_t voffset   = vdupq_n_f32(offset);
    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
    const int32x4x4_t rf =
    {
        {
#ifdef __aarch64__
            // Round-to-nearest-even conversion available on AArch64
            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#else  //__aarch64__
            // Truncating conversion fallback on AArch32
            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#endif //__aarch64__
        }
    };
    // Narrow S32 -> S16 -> U8 with saturation
    return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
/** Quantize a neon vector holding 8 floating point values to signed output.
 *
 * @param[in] qv Input float values to quantize.
 * @param[in] qi Quantization information (scale and offset) to use.
 *
 * @return 8 quantized values (QASYMM8_SIGNED): saturate(round(q / scale + offset)).
 */
inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
{
    const float       scale     = qi.scale;
    const int32_t     offset    = qi.offset;
    const float32x4_t voffset   = vdupq_n_f32(offset);
    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
    const int32x4x4_t rf =
    {
        {
#ifdef __aarch64__
            // Round-to-nearest-even conversion available on AArch64
            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#else  //__aarch64__
            // Truncating conversion fallback on AArch32
            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
#endif //__aarch64__
        }
    };
    // Narrow S32 -> S16 -> S8 with saturation
    return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
}
/** Quantize 16 floating point values to 32-bit integers (shared helper).
 *
 * @param[in] qv     Input float values to quantize.
 * @param[in] scale  Quantization scale.
 * @param[in] offset Quantization offset.
 *
 * @return Four int32x4 vectors holding round(q / scale) + offset, before narrowing.
 */
inline int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int32_t offset)
{
    const int32x4_t   voffset   = vdupq_n_s32(offset);
    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
    const int32x4x4_t rf =
    {
        {
#ifdef __aarch64__
            // Round-to-nearest-with-ties-away conversion available on AArch64
            vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
            vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
            vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
            vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#else  //__aarch64__
            // Truncating conversion fallback on AArch32
            vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
            vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
            vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
            vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
#endif //__aarch64__
        }
    };
    return rf;
}
/** Quantize a neon vector holding 16 floating point values.
 *
 * @param[in] qv Input float values to quantize.
 * @param[in] qi Quantization information (scale and offset) to use.
 *
 * @return 16 quantized values (QASYMM8).
 */
inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
    auto rf = vquantize_internal(qv, qi.scale, qi.offset);
    // Narrow each pair S32 -> S16 -> U8 with saturation, then combine both halves
    const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
    const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
    return vcombine_u8(pa, pb);
}
/** Quantize a neon vector holding 16 floating point values to signed output.
 *
 * @param[in] qv Input float values to quantize.
 * @param[in] qi Quantization information (scale and offset) to use.
 *
 * @return 16 quantized values (QASYMM8_SIGNED).
 */
inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
    auto rf = vquantize_internal(qv, qi.scale, qi.offset);
    // Narrow each pair S32 -> S16 -> S8 with saturation, then combine both halves
    const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
    const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
    return vcombine_s8(pa, pb);
}
/** Quantize to QASYMM16 a neon vector holding 16 floating point values.
 *
 * @param[in] qv Input float values to quantize.
 * @param[in] qi Quantization information (scale and offset) to use.
 *
 * @return 16 quantized 16-bit values as a uint16x8x2_t.
 */
inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
    auto rf = vquantize_internal(qv, qi.scale, qi.offset);
    // Narrow S32 -> U16 with saturation, two vectors of 8 values each
    const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
    const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
    return { pa, pb };
}
720 #endif // ARM_COMPUTE_NEASYMM_H

__global uchar *offset(const Image *img, int x, int y)
Get the pointer position of an Image.
uint8x8x2_t qasymm8x8x2_t
8 bit quantized asymmetric vector with 16 elements
uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
Quantize to QASYMM16 a neon vector holding 16 floating point values.
float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
Dequantize a neon vector holding 8 quantized values.
int8x8x4_t qasymm8x8x4_signed_t
8 bit quantized signed asymmetric vector with 32 elements
Copyright (c) 2017-2021 Arm Limited.
uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo)
Perform a multiply-accumulate on all 16 components of a QASYMM8 vector.
int8x8_t qasymm8x8_signed_t
8 bit quantized signed asymmetric vector with 8 elements
int8x8x2_t qasymm8x8x2_signed_t
8 bit quantized signed asymmetric vector with 16 elements
uint8x8x3_t qasymm8x8x3_t
8 bit quantized asymmetric vector with 24 elements
int8x8x3_t qasymm8x8x3_signed_t
8 bit quantized signed asymmetric vector with 24 elements
uint8x8x4_t qasymm8x8x4_t
8 bit quantized asymmetric vector with 32 elements
uint8x8_t qasymm8x8_t
8 bit quantized asymmetric vector with 8 elements
int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector.
uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int32x4_t result_offset_after_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
Performs final quantization step on 16 elements.
int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent)
Round to the nearest division by a power-of-two using exponent.
uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
Quantize a neon vector holding 8 floating point values.
int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32, const int32x4x4_t &result_fixedpoint_multiplier, const int32x4x4_t &result_shift, const int32x4_t &result_offset_after_shift_s32, const int8x16_t &min_s8, const int8x16_t &max_s8, const bool is_bounded_relu)
Performs final quantization step on 16 elements for symmetric quantization.
int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
Quantize a neon vector holding 8 floating point values.
int8x16_t qasymm8x16_signed_t
8 bit quantized signed asymmetric vector with 16 elements
int32x4x4_t vquantize_internal(const float32x4x4_t &qv, float scale, int32_t offset)
uint8x16_t qasymm8x16_t
8 bit quantized asymmetric vector with 16 elements