// NOTE(review): excerpt of a NEON QASYMM8 (quantized uint8) activation kernel,
// Arm Compute Library style. Many original lines are elided and the leading
// integers on some lines are source line numbers fused in by extraction — this
// chunk is not compilable as-is. Comments below are hedged accordingly.

// Vector main loop processes 16 u8 lanes per iteration; a scalar tail loop
// below handles the remaining 0..15 elements.
47 constexpr
int window_step_x = 16;
// Horizontal (x-dimension) bounds of the execution window for this invocation.
48 const auto window_start_x =
static_cast<int>(window.
x().
start());
49 const auto window_end_x =
static_cast<int>(window.
x().
end());
// Constants broadcast to all four f32 lanes for the vectorized float-domain math.
66 const auto vconst_1 = vdupq_n_f32(1.f);
69 const auto vconst_0_f32 = vdupq_n_f32(0);
70 #else // #ifndef __aarch64__
// Non-aarch64 branch of a preprocessor split (the matching #ifndef is elided):
// GELU constants for the erf formulation — 0.5 and 1/sqrt(2).
71 const auto const_inv_2 = vdupq_n_f32(0.5f);
72 const auto const_inv_sqrt_2 = vdupq_n_f32(0.70710678118f);
// Activation parameters a/b broadcast per-lane; their meaning depends on the
// selected activation (e.g. upper/lower bounds for bounded ReLU variants,
// scale factors for TANH) — confirm against ActivationLayerInfo docs.
74 const float32x4_t va_f32 = vdupq_n_f32(
act_info.a());
75 const float32x4_t vb_f32 = vdupq_n_f32(
act_info.b());
// HARD_SWISH constants: x * clamp(x + 3, 0, 6) * 1/6 (0.166666667 ~= 1/6).
80 const auto const_6_f32 = vdupq_n_f32(6.f);
81 const auto const_0_f32 = vdupq_n_f32(0.f);
82 const auto const_3_f32 = vdupq_n_f32(3.f);
83 const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
// Requantization scale (s) and offset (o) broadcast to vectors — presumably
// derived from input/output UniformQuantizationInfo in elided lines; TODO confirm.
89 float32x4_t vs = vdupq_n_f32(s);
90 float32x4_t vo = vdupq_n_f32(o);
// Per-row raw pointers into the input/output tensors for this window position.
96 const auto input_ptr =
reinterpret_cast<const qasymm8_t *
>(
input.ptr());
97 const auto output_ptr =
reinterpret_cast<qasymm8_t *
>(output.
ptr());
// ---- Vectorized main loop: 16 u8 elements per iteration ----
102 int x = window_start_x;
103 for (; x <= (window_end_x - window_step_x); x += window_step_x)
// RELU: max(0, x) directly in the quantized u8 domain, then a fused
// multiply-add requantization (vmlaq_qasymm8 with round-to-nearest-up).
106 if (act == ActivationLayerInfo::ActivationFunction::RELU)
109 tmp = vmaxq_u8(vconst_0, vin);
111 tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
// BOUNDED_RELU: min(a, max(0, x)) in the quantized domain.
113 else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
116 tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
118 tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
// LU_BOUNDED_RELU: clamp to [b, a] in the quantized domain.
120 else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
123 tmp = vminq_u8(va, vmaxq_u8(vb, vin));
125 tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
// The activations below work in the float domain: dequantize to float32x4x4_t,
// apply the function per lane, requantize. Their bodies are largely elided in
// this excerpt. On aarch64 LOGISTIC/HARD_SWISH use a LUT path elsewhere.
127 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
128 else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
133 const float32x4x4_t tmp_dep = {{
142 #endif // __aarch64__
143 else if (act == ActivationLayerInfo::ActivationFunction::TANH)
148 const float32x4x4_t tmp_dep = {{
157 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
158 else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
163 const float32x4x4_t tmp_dep = {{
192 else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
// pos_mask presumably selects x vs a*x per lane (sign test) — body elided;
// TODO confirm against the full source.
196 const uint32x4x4_t pos_mask = {{
203 const float32x4x4_t tmp_dep = {{
212 #else // #ifndef __aarch64__
// aarch64-only vector GELU: x * 0.5 * (1 + erf(x / sqrt(2))) on the
// dequantized lanes (vin_deq), using const_inv_2 / const_inv_sqrt_2 above.
213 else if (act == ActivationLayerInfo::ActivationFunction::GELU)
217 const float32x4x4_t tmp_dep = {{
221 vin_deq.val[0], const_inv_sqrt_2))))),
225 vin_deq.val[1], const_inv_sqrt_2))))),
229 vin_deq.val[2], const_inv_sqrt_2))))),
233 vin_deq.val[3], const_inv_sqrt_2))))),
238 #endif // __aarch64__
// ---- Scalar tail loop: leftover elements after the vector loop ----
247 for (; x < window_end_x; ++x)
// Same dispatch as the vector loop, one element at a time. `in`/`tmp` appear
// to be quantized values, `tmp_f` the dequantized float — declarations elided.
251 if (act == ActivationLayerInfo::ActivationFunction::RELU)
253 tmp = std::max(const_0, in);
256 else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
258 tmp = std::min(a, std::max(const_0, in));
261 else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
263 tmp = std::min(a, std::max(
b, in));
266 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
267 else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
// Sigmoid on the dequantized value: 1 / (1 + e^-x).
270 tmp_f = 1.f / (1.f + std::exp(-tmp_f));
273 #endif // __aarch64__
274 else if (act == ActivationLayerInfo::ActivationFunction::TANH)
// Scaled tanh: a * tanh(b * x).
277 tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
280 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
281 else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
// x * relu6(x + 3) / 6.
284 tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
287 else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
// x for x > 0, else a * x.
290 tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
293 else if (act == ActivationLayerInfo::ActivationFunction::GELU)
// NOTE(review): this passes `in` (apparently the raw value, not the
// dequantized tmp_f used elsewhere in this loop) into erff, and assigns a
// float expression to `tmp` rather than `tmp_f` — looks inconsistent with
// the other scalar branches; verify against the full (unelided) source.
296 tmp = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f));
299 #endif // __aarch64__
// Store the final (re)quantized u8 result for this element.
304 *(output_ptr + x) = tmp;