// NOTE(review): fragment of an SVE2 activation kernel over QASYMM8_SIGNED (int8)
// data. Several original lines are not visible here (branch braces, the loop
// header, declarations of `tmp`/`va`/`vb`/`s`/`o`/`requant`, and some closing
// parentheses), so comments below describe only what the visible code shows.

// Horizontal (x) extent of the execution window for this kernel invocation.
45 const auto window_start_x =
static_cast<int>(window.
x().
start());
46 const auto window_end_x =
static_cast<int>(window.
x().
end());
// Broadcast constants used by the various activation branches.
// `const_0` is presumably the quantized zero point of the output — TODO confirm
// against the lines not shown here.
60 const auto vconst_0 = svdup_n_s8(const_0);
61 const auto vconst_1 = svdup_n_f32(1.f);
// a() / b() are the activation parameters (e.g. TANH's scale factors,
// clamp bounds for the bounded ReLU variants after quantization).
62 const auto va_f32 = svdup_n_f32(
act_info.a());
63 const auto vb_f32 = svdup_n_f32(
act_info.b());
// Float constants for HARD_SWISH: x * 1/6 * min(6, max(0, x + 3)).
64 const auto const_6_f32 = svdup_n_f32(6.f);
65 const auto const_0_f32 = svdup_n_f32(0.f);
66 const auto const_3_f32 = svdup_n_f32(3.f);
67 const auto const_inv_6_f32 = svdup_n_f32(0.166666667f);
// Requantization scale/offset as f32 vectors; `s` and `o` are computed on
// lines not shown — presumably from the input/output quantization infos.
77 auto vs = svdup_n_f32(s);
78 auto vo = svdup_n_f32(o);
// Input quantization offset (zero point), used by LEAKY_RELU to decide which
// lanes are negative in real-value terms.
81 const auto voffset_in = svdup_n_s32(qi_in.
offset);
// Fixed-point multiplier/offset pairs for the identity side and the leaky
// side of LEAKY_RELU; s_s32/o_s32/s_leaky_s32/o_leaky_s32 come from unseen lines.
84 const auto vs_s32 = svdup_n_s32(s_s32);
85 const auto vo_s32 = svdup_n_s32(o_s32);
91 const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
92 const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
// Raw int8 pointers into the current row of the input/output tensors
// (`input`/`output` are iterators declared outside this fragment).
98 const auto input_ptr =
reinterpret_cast<const int8_t *
>(
input.ptr());
99 const auto output_ptr =
reinterpret_cast<int8_t *
>(output.
ptr());
// Predicated loop over the x range: `pg` holds one lane per int8 element
// still inside [x, window_end_x). The matching `do {` is on an unseen line.
103 int x = window_start_x;
104 svbool_t pg = svwhilelt_b8(x, window_end_x);
// Load one predicated vector of quantized inputs.
107 const auto vin = svld1_s8(pg, input_ptr + x);
// RELU: max(zero_point, x) directly in the quantized domain, then
// requantize via fused multiply-add only when in/out qinfos differ.
108 if (act == ActivationLayerInfo::ActivationFunction::RELU)
111 tmp = svmax_s8_z(pg, vconst_0, vin);
113 tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
// BOUNDED_RELU: min(a, max(zero_point, x)); `va` is the quantized upper
// bound (declared on unseen lines).
115 else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
118 tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
120 tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
// LU_BOUNDED_RELU: clamp to [b, a] in the quantized domain.
122 else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
125 tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
127 tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
// LOGISTIC: dequantize to 4 f32 vectors, evaluate the sigmoid, requantize.
// Only the denominator 1 + e^(-x) is visible below; the enclosing operation
// (presumably a reciprocal/division producing 1/(1+e^-x)) sits on lines not
// shown — note the extra trailing parenthesis on each operand.
129 else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
132 const auto vin_deq = svdequantize_z(pg, vin, qi_in);
134 const svfloat32x4_t tmp_dep = svcreate4_f32(
137 svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
140 svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
143 svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
146 svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
148 tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
// TANH: a * tanh(b * x) computed in f32 on the dequantized values.
150 else if (act == ActivationLayerInfo::ActivationFunction::TANH)
153 const auto vin_deq = svdequantize_z(pg, vin, qi_in);
155 const svfloat32x4_t tmp_dep = svcreate4_f32(
156 svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
157 svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
158 svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
159 svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
161 tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
// HARD_SWISH: x * (1/6) * min(6, max(0, x + 3)) per f32 sub-vector.
// The `+ 3` operand (const_3_f32) and the closing parentheses of each
// svadd_f32_z are on lines not shown here.
163 else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
166 const auto vin_deq = svdequantize_z(pg, vin, qi_in);
168 const svfloat32x4_t tmp_dep = svcreate4_f32(
169 svmul_f32_z(pg, svget4_f32(vin_deq, 0),
170 svmul_f32_z(pg, const_inv_6_f32,
171 svmin_f32_z(pg, const_6_f32,
172 svmax_f32_z(pg, const_0_f32,
173 svadd_f32_z(pg, svget4_f32(vin_deq, 0),
175 svmul_f32_z(pg, svget4_f32(vin_deq, 1),
176 svmul_f32_z(pg, const_inv_6_f32,
177 svmin_f32_z(pg, const_6_f32,
178 svmax_f32_z(pg, const_0_f32,
179 svadd_f32_z(pg, svget4_f32(vin_deq, 1),
181 svmul_f32_z(pg, svget4_f32(vin_deq, 2),
182 svmul_f32_z(pg, const_inv_6_f32,
183 svmin_f32_z(pg, const_6_f32,
184 svmax_f32_z(pg, const_0_f32,
185 svadd_f32_z(pg, svget4_f32(vin_deq, 2),
187 svmul_f32_z(pg, svget4_f32(vin_deq, 3),
188 svmul_f32_z(pg, const_inv_6_f32,
189 svmin_f32_z(pg, const_6_f32,
190 svmax_f32_z(pg, const_0_f32,
191 svadd_f32_z(pg, svget4_f32(vin_deq, 3),
194 tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
// LEAKY_RELU: performed in 32-bit fixed point rather than f32.
196 else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
198 svbool_t p0, p1, p2, p3;
// Widen the int8 vector to four int32 sub-vectors via SVE2 move-long
// bottom/top pairs (int8 -> int16 -> int32).
202 const svint32x4_t vin_s32 =
203 svcreate4_s32(svmovlb_s32(svmovlb_s16(vin)), svmovlb_s16(vin)) /* see next line */, svmovlt_s32(svmovlb_s16(vin)),
204 svmovlb_s32(svmovlt_s16(vin)), svmovlt_s32(svmovlt_s16(vin)));
// Per-lane predicates marking "negative" inputs, i.e. values below the
// input zero point; comparison direction flips with the sign of the scale.
207 if (qi_in.
scale >= 0)
209 p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
210 p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
211 p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
212 p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
216 p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
217 p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
218 p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
219 p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
// Requantizing path (guard condition on an unseen line): select the leaky
// multiplier/offset on negative lanes, identity pair elsewhere, then
// offset + x * scale via predicated MLA.
225 tmp_dep = svcreate4_s32(
227 svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
228 svsel(p0, vs_leaky_s32, vs_s32)),
231 svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
232 svsel(p1, vs_leaky_s32, vs_s32)),
235 svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
236 svsel(p2, vs_leaky_s32, vs_s32)),
239 svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
240 svsel(p3, vs_leaky_s32, vs_s32)),
// Non-requantizing path (alternative branch, condition line not shown):
// apply the leaky scale only on the negative lanes (merging MAD under
// p0..p3) and shift right by 8 to drop the fixed-point fraction.
245 tmp_dep = svcreate4_s32(
246 svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
247 svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
248 svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
249 svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
// Saturating narrow back: int32x4 -> two int16 vectors -> one int8 vector.
253 const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
254 const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
257 tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
// Store the activated vector and advance the predicate; the increment of
// `x` (presumably by svcntb()) is on an unseen line.
264 svst1_s8(pg, output_ptr + x, tmp);
267 pg = svwhilelt_b8(x, window_end_x);
269 }
// Loop until no lane of `pg` is active, i.e. the whole x range is consumed.
while (svptest_any(svptrue_b8(), pg));