// NOTE(review): fragment of an SVE2 QASYMM8 (uint8 quantized) activation
// kernel. The enclosing function signature, several interleaved lines (the
// embedded original numbering jumps, e.g. 107 -> 109, 206 -> 211), and the
// opening `do` of the trailing do/while loop are NOT visible in this view,
// so only comments were added here — no code token has been changed.

// Execution-window bounds along X; drive the predicated vector loop below.
45 const auto window_start_x =
static_cast<int>(window.
x().
start());
46 const auto window_end_x =
static_cast<int>(window.
x().
end());
// Broadcast constants: const_0 (presumably the u8 zero point — confirm in the
// non-visible setup code) for ReLU-style clamping, 1.0f for the logistic
// denominator, and the activation parameters a/b as f32 vectors.
60 const auto vconst_0 = svdup_n_u8(const_0);
61 const auto vconst_1 = svdup_n_f32(1.f);
62 const auto va_f32 = svdup_n_f32(
act_info.a());
63 const auto vb_f32 = svdup_n_f32(
act_info.b());
// Requantization scale/offset as f32 vectors (s and o are defined outside
// this view — presumably derived from qi_in/qi_out; verify against caller).
73 auto vs = svdup_n_f32(s);
74 auto vo = svdup_n_f32(o);
// Input quantization offset widened to s32, used below to compare the
// widened input lanes against the zero point in the LEAKY_RELU path.
77 const auto voffset_in = svdup_n_s32(qi_in.
offset);
// Fixed-point requant parameters: one scale/offset pair for the identity
// (x >= 0) side and one for the leaky side. Their computation is not
// visible here — assumed precomputed from act_info/qi_in/qi_out.
80 const auto vs_s32 = svdup_n_s32(s_s32);
81 const auto vo_s32 = svdup_n_s32(o_s32);
87 const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
88 const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
// Per-row raw pointers into the input/output iterators (executed inside a
// window loop whose body is only partially visible here).
94 const auto input_ptr =
reinterpret_cast<const uint8_t *
>(
input.ptr());
95 const auto output_ptr =
reinterpret_cast<uint8_t *
>(output.
ptr());
// Predicated loop over X: svwhilelt_b8 yields an all-false predicate once
// x reaches window_end_x, which terminates the do/while at the bottom.
99 int x = window_start_x;
100 svbool_t pg = svwhilelt_b8(x, window_end_x);
// Load one u8 vector of quantized inputs under the predicate.
103 const auto vin = svld1_u8(pg, input_ptr + x);
// RELU: clamp below at vconst_0, then requantize (fused multiply-add in the
// quantized domain) only when input/output quant infos differ (`requant`).
104 if (act == ActivationLayerInfo::ActivationFunction::RELU)
107 tmp = svmax_u8_z(pg, vconst_0, vin);
109 tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
// BOUNDED_RELU: clamp to [vconst_0, va] (va is the quantized `a` bound,
// defined outside this view), then the same conditional requantize.
111 else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
114 tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
116 tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
// LU_BOUNDED_RELU: clamp to [vb, va]; note the requantize here is
// unconditional, unlike the two branches above.
118 else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
121 tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
123 tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
// LOGISTIC: dequantize to 4 f32 vectors, compute 1/(1 + e^-x) per vector,
// then quantize back. NOTE(review): the extra closing paren on each line
// implies a wrapping call (presumably svdiv_f32_z(pg, vconst_1, ...)) on
// the original lines 132/135/138/141, which are missing from this view.
125 else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
128 const auto vin_deq = svdequantize_z(pg, vin, qi_in);
130 const svfloat32x4_t tmp_dep = svcreate4_f32(
133 svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
136 svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
139 svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
142 svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
145 tmp = svquantize_z(pg, tmp_dep, qi_out);
// TANH: dequantize, compute a * tanh(b * x) per f32 vector, quantize back.
147 else if (act == ActivationLayerInfo::ActivationFunction::TANH)
150 const auto vin_deq = svdequantize_z(pg, vin, qi_in);
152 const svfloat32x4_t tmp_dep = svcreate4_f32(
153 svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
154 svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
155 svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
156 svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
159 tmp = svquantize_z(pg, tmp_dep, qi_out);
// LEAKY_RELU: done entirely in the integer domain.
161 else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
163 svbool_t p0, p1, p2, p3;
// Widen the u8 vector into 4 s32 vectors (bottom/top halves via
// svmovlb/svmovlt), reinterpreted as signed for offset comparison.
167 const svint32x4_t vin_s32 = svcreate4_s32(svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
168 svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
169 svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))),
170 svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))));
// Per-lane predicates marking the "leaky" side: lanes below the input
// zero point when the input scale is non-negative, above it otherwise
// (the flip accounts for a negative quantization scale).
173 if (qi_in.
scale >= 0)
175 p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
176 p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
177 p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
178 p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
182 p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
183 p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
184 p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
185 p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
// Requantize each s32 vector with per-lane selected scale/offset:
// leaky pair where the predicate is set, identity pair elsewhere.
191 tmp_dep = svcreate4_s32(
193 svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
194 svsel(p0, vs_leaky_s32, vs_s32)),
197 svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
198 svsel(p1, vs_leaky_s32, vs_s32)),
201 svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
202 svsel(p2, vs_leaky_s32, vs_s32)),
205 svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
206 svsel(p3, vs_leaky_s32, vs_s32)),
// NOTE(review): a second tmp_dep variant — madd on the leaky-predicated
// lanes then an arithmetic shift right by 8. The branch/preprocessor
// structure selecting between this and the version above (original
// lines 206–211) is not visible in this view — confirm in the full file.
211 tmp_dep = svcreate4_s32(
212 svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
213 svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
214 svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
215 svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
// Saturating narrow: 4 x s32 -> 2 x u16 (svqxtunb/svqxtunt interleave
// bottom/top halves) -> 1 x u8 vector.
219 const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
220 const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
223 tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
// Store the activated vector; the x increment between original lines 230
// and 233 (presumably x += svcntb()) is not visible in this view.
230 svst1_u8(pg, output_ptr + x, tmp);
233 pg = svwhilelt_b8(x, window_end_x);
235 }
// Loop until the predicate has no active lanes (x reached window_end_x).
while (svptest_any(svptrue_b8(), pg));