24 #ifndef ARM_COMPUTE_NESYMM_H
25 #define ARM_COMPUTE_NESYMM_H
/** Performs the final fixed-point requantization step on 8 signed 32-bit
 * accumulators, producing 8 saturated signed 16-bit values.
 *
 * The value is multiplied by a fixed-point multiplier and shifted by
 * @p result_shift (a negative shift means a multiplication by a power of two
 * applied *before* the fixed-point multiply).
 *
 * @tparam is_bounded_relu Specifies if a fused bounded relu should be applied
 *
 * @param[in,out] in_s32                       Accumulators to be requantized (clobbered).
 * @param[in]     result_fixedpoint_multiplier Fixed-point multiplier (Q0.31 format)
 * @param[in]     result_shift                 Result shift parameter; negative = left shift
 * @param[in]     min_s16                      Relu lower bound (used only when is_bounded_relu)
 * @param[in]     max_s16                      Relu upper bound (used only when is_bounded_relu)
 *
 * @return Requantized values as a S16 vector
 */
template <bool is_bounded_relu>
int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
                                      int          result_fixedpoint_multiplier,
                                      int32_t      result_shift,
                                      int16x8_t    min_s16,
                                      int16x8_t    max_s16)
{
    if (result_shift < 0)
    {
        // Negative shift: scale up by 2^(-shift) first, then apply the
        // saturating rounding doubling multiply-high with the multiplier
        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift));
        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift));

        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
    }
    else
    {
        // Fixed-point multiplication with vector saturating rounding doubling
        // multiply high with scalar
        in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
        in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
        // Round to the nearest division by a power-of-two using result_shift
        // NOTE(review): these two calls were lost in the garbled source; restored
        // to mirror the scalar overload's rounding_divide_by_pow2 step — confirm.
        in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
        in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
    }

    // Convert S32 to S16 with saturation
    int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));

    if (is_bounded_relu)
    {
        // Clamp to the fused activation bounds
        out_s16 = vmaxq_s16(out_s16, min_s16);
        out_s16 = vminq_s16(out_s16, max_s16);
    }

    return out_s16;
}
99 template <
bool is_bounded_relu>
101 int32_t in_value,
int result_fixedpoint_multiplier, int32_t result_shift, int16_t min_s16, int16_t max_s16)
103 if (result_shift < 0)
105 const int64_t in_64 =
static_cast<int64_t
>(in_value) * (1 << (-result_shift)) *
106 static_cast<int64_t
>(result_fixedpoint_multiplier);
107 in_value =
static_cast<int32_t
>((in_64 + (1 << 30)) >> 31);
112 const int64_t in_64 =
static_cast<int64_t
>(in_value) *
static_cast<int64_t
>(result_fixedpoint_multiplier);
118 int16_t out_s16 =
static_cast<int16_t
>(std::max<int32_t>(-32768, std::min<int32_t>(32767, in_value)));
122 out_s16 =
static_cast<int16_t
>(std::max(min_s16, std::min(max_s16, out_s16)));
/** Dequantize a neon vector holding 8 16-bit quantized values.
 *
 * @param[in] qv    Input values to be dequantized
 * @param[in] scale Quantization scale
 *
 * @return Dequantized values in a neon vector (low half in val[0], high half in val[1])
 */
inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale)
{
    const float32x4_t   vscale             = vdupq_n_f32(scale);
    // Widen each S16 half to S32, convert to F32 and multiply by the scale
    const float32x4x2_t vdequantized_input = {{
        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale),
    }};
    return vdequantized_input;
}
/** Quantize a neon vector holding 8 floating point values to QSYMM16.
 *
 * @param[in] qv    Input values to be quantized
 * @param[in] scale Quantization scale (must be non-zero)
 *
 * @return Quantized, saturated values in a S16 neon vector
 */
inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
{
    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);

    const int32x4x2_t rf = {{
#ifdef __aarch64__
        // AArch64: round to nearest with ties to even
        vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
        vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
#else  //__aarch64__
        // AArch32: vcvtn is unavailable, fall back to truncating conversion
        vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
        vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
#endif //__aarch64__
    }};
    // Narrow S32 -> S16 with saturation
    return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
}
/** Dequantize a neon vector holding 16 16-bit quantized values.
 *
 * @param[in] qv Input values to be dequantized
 * @param[in] qi Quantization information to be used in the computation
 *
 * @return Dequantized values in a neon vector
 */
inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi)
{
    const float         scale              = qi.scale;
    const float32x4_t   vscale             = vdupq_n_f32(scale);
    // Widen each S16 half of both input registers to S32, convert to F32
    // and multiply by the scale
    const float32x4x4_t vdequantized_input = {{
        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
    }};
    return vdequantized_input;
}
/** Quantize a neon vector holding 16 floating point values to QSYMM16.
 *
 * @param[in] qv Input values to be quantized
 * @param[in] qi Quantization information to be used in the computation
 *               (qi.scale is assumed non-zero — it is used as a divisor)
 *
 * @return Quantized, saturated values in a pair of S16 neon vectors
 */
inline int16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
{
    const float       scale     = qi.scale;
    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);

    const int32x4x4_t rf = {{
#ifdef __aarch64__
        // AArch64: round to nearest with ties to even
        vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
        vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
        vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
        vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
#else  //__aarch64__
        // AArch32: vcvtn is unavailable, fall back to truncating conversion
        vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
        vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
        vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
        vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
#endif //__aarch64__
    }};

    // Narrow S32 -> S16 with saturation, two result registers
    const int16x8x2_t res = {
        vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])),
        vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])),
    };
    return res;
}
227 const auto left_shift = shift > 0 ? shift : 0;
228 const auto right_shift = shift > 0 ? 0 : -shift;
229 const auto one_shifted = 1 << left_shift;
239 #endif // ARM_COMPUTE_NESYMM_H