// NOTE(review): This is a fragmented excerpt of a NEON activation kernel for
// signed 8-bit quantized (QASYMM8_SIGNED-style) tensors. The leading numeric
// tokens on most lines ("45", "46", "65", ...) look like line numbers from the
// original file that were fused into the text during extraction, and several
// statements are truncated (each "tmp_dep = {{" initializer is cut off, no
// closing braces are visible). The fragment will not compile as-is; the code
// below is preserved byte-for-byte pending recovery of the original file.

// Vector step: 16 int8 lanes per NEON iteration; window bounds come from the
// execution window's x-dimension.
45 constexpr
int window_step_x = 16;
46 const auto window_start_x =
static_cast<int>(window.
x().
start());
47 const auto window_end_x =
static_cast<int>(window.
x().
end());
// Float constants broadcast to all 4 lanes of a float32x4_t.
65 const auto vconst_1 = vdupq_n_f32(1.f);
66 const auto vconst_0_f32 = vdupq_n_f32(0.f);
// a()/b() are the activation's alpha/beta parameters (clamp bounds, tanh
// scale, leaky slope — meaning depends on the selected ActivationFunction).
68 const float32x4_t va_f32 = vdupq_n_f32(
act_info.a());
69 const float32x4_t vb_f32 = vdupq_n_f32(
act_info.b());
// Constants for hard swish: x * clamp(x + 3, 0, 6) * (1/6); 0.166666667f ~ 1/6.
72 const auto const_6_f32 = vdupq_n_f32(6.f);
73 const auto const_0_f32 = vdupq_n_f32(0.f);
74 const auto const_3_f32 = vdupq_n_f32(3.f);
75 const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
// Requantization scale (vs) and offset (vo) — consumed by
// vmlaq_qasymm8_signed(tmp, vs, vo) in the integer-only branches below.
// s and o are presumably derived from the input/output quantization info
// — TODO confirm against the elided lines.
80 float32x4_t vs = vdupq_n_f32(s);
81 float32x4_t vo = vdupq_n_f32(o);
// ---- Main vectorized loop: whole 16-lane chunks of the x window. ----
93 int x = window_start_x;
94 for (; x <= (window_end_x - window_step_x); x += window_step_x)
// RELU: max(0, x) computed directly on int8 lanes, then requantized with
// round-to-nearest-up. No dequantize needed for pure clamping.
97 if (act == ActivationLayerInfo::ActivationFunction::RELU)
100 tmp = vmaxq_s8(vconst_0, vin);
102 tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
// BOUNDED_RELU: min(a, max(0, x)) — upper bound a, lower bound 0.
104 else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
107 tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
109 tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
// LU_BOUNDED_RELU: min(a, max(b, x)) — clamp to [b, a].
111 else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
114 tmp = vminq_s8(va, vmaxq_s8(vb, vin));
116 tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
// LOGISTIC is compiled only for 32-bit Arm; on aarch64 a LUT path is used
// instead (per the original #ifndef comment).
118 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
119 else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
// tmp_dep holds 4x float32x4 = 16 dequantized lanes; initializer elided
// in this fragment.
124 const float32x4x4_t tmp_dep = {{
133 #endif // __aarch64__
134 else if (act == ActivationLayerInfo::ActivationFunction::TANH)
139 const float32x4x4_t tmp_dep = {{
148 else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
153 const float32x4x4_t tmp_dep = {{
// LEAKY_RELU: build a per-lane mask of strictly-positive inputs
// (vcgtz = compare-greater-than-zero) on the dequantized values, then
// blend x and a*x — blend elided in this fragment.
182 else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
187 const uint32x4x4_t pos_mask = {{
188 wrapper::vcgtz(vin_deq.val[0]),
189 wrapper::vcgtz(vin_deq.val[1]),
190 wrapper::vcgtz(vin_deq.val[2]),
191 wrapper::vcgtz(vin_deq.val[3]),
// Second pos_mask initializer — presumably the alternate (#else) build
// path of an #ifdef __aarch64__ split; TODO confirm from the full file.
194 const uint32x4x4_t pos_mask = {{
200 #endif // __aarch64__
202 const float32x4x4_t tmp_dep = {{
// ---- Scalar tail loop: leftover elements (fewer than 16 lanes). Mirrors
// every vector branch above element-by-element. ----
219 for (; x < window_end_x; ++x)
223 if (act == ActivationLayerInfo::ActivationFunction::RELU)
225 tmp = std::max(const_0, in);
228 else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
230 tmp = std::min(a, std::max(const_0, in));
233 else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
235 tmp = std::min(a, std::max(
b, in));
238 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
239 else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
// Scalar sigmoid on the dequantized value tmp_f.
242 tmp_f = 1.f / (1.f + std::exp(-tmp_f));
245 #endif // __aarch64__
246 else if (act == ActivationLayerInfo::ActivationFunction::TANH)
// Scaled tanh: a * tanh(b * x).
249 tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
252 else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
// Hard swish: x * clamp(x + 3, 0, 6) / 6.
255 tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
258 else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
// Leaky ReLU: identity for positive x, slope a otherwise.
261 tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
// Store the (re)quantized int8 result for this element.
268 *(output_ptr + x) = tmp;