24 #ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H 25 #define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H 42 constexpr
int window_step_x = 16 /
sizeof(T);
43 const auto window_start_x = static_cast<int>(window.
x().
start());
44 const auto window_end_x = static_cast<int>(window.
x().
end());
51 const int sum_stages = log2(window_step_x / 2);
55 const auto in_ptr = reinterpret_cast<const T *>(
input.ptr());
56 const auto out_ptr = reinterpret_cast<T *>(output.
ptr());
59 auto vec_max =
wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
60 int x = window_start_x;
62 for(; x <= (window_end_x - window_step_x); x += window_step_x)
69 for(
int i = 0; i < sum_stages; ++i)
76 for(; x < window_end_x; ++x)
78 max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
90 static_assert(std::is_same<T, qasymm8_t>::value
91 || std::is_same<T, qasymm8_signed_t>::value,
92 "quantized type should be either qasymm8_t or qasymm8_signed_t.");
98 const auto scale_beta_vec = vdupq_n_f32(scale_beta);
103 constexpr
int vec_size = 16;
108 const auto in_ptr = reinterpret_cast<const T *>(in_it.
ptr()) + start_x;
109 const auto out_ptr = reinterpret_cast<T *>(out_it.
ptr()) + start_x;
110 const auto tmp_ptr = reinterpret_cast<float *>(tmp);
113 float sum_inversed{};
118 const auto max_val = *reinterpret_cast<const T *>(max_it.
ptr());
122 float32x4x4_t vec_sum =
132 for(; x <= (
input_width - vec_size); x += vec_size)
136 auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
140 vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
141 vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
142 vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
143 vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
144 vec_sum.val[0] = vaddq_f32(vec_sum.val[0],
vexpq_f32(vec_elements_flt.val[0]));
145 vec_sum.val[1] = vaddq_f32(vec_sum.val[1],
vexpq_f32(vec_elements_flt.val[1]));
146 vec_sum.val[2] = vaddq_f32(vec_sum.val[2],
vexpq_f32(vec_elements_flt.val[2]));
147 vec_sum.val[3] = vaddq_f32(vec_sum.val[3],
vexpq_f32(vec_elements_flt.val[3]));
151 vec_elements_flt.val[0] =
vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
152 vec_elements_flt.val[1] =
vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
153 vec_elements_flt.val[2] =
vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
154 vec_elements_flt.val[3] =
vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
155 vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
156 vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
157 vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
158 vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
161 vst4q_f32(tmp_ptr + x, vec_elements_flt);
165 const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
166 auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
167 sum_res = vpadd_f32(sum_res, sum_res);
176 element = (max_val - in_ptr[x]) * scale_beta;
177 sum += std::exp(element);
181 element = std::exp((max_val - in_ptr[x]) * scale_beta);
185 tmp_ptr[x] = element;
190 sum_inversed = 256.f / sum;
200 constexpr
bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
203 for(; x <= (
input_width - vec_size); x += vec_size)
206 float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
207 int_vec_type normalized_value{};
210 const float32x4x4_t sub =
212 vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
213 vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
214 vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
215 vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
217 normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
223 vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
224 vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
225 vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
226 vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
229 if(is_qasymm8_signed)
238 normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
247 out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
251 out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
256 in_it, max_it, out_it);
259 template <
typename T>
261 ITensor *out,
const float beta,
bool is_log,
const Window &window)
273 constexpr
int vec_size = 16 /
sizeof(T);
274 const int sum_stages = log2(vec_size / 2);
279 const auto in_ptr = reinterpret_cast<const T *>(in_it.
ptr()) + start_x;
280 const auto out_ptr = reinterpret_cast<T *>(out_it.
ptr()) + start_x;
281 const auto tmp_ptr = reinterpret_cast<T *>(tmp);
289 const auto max_val = *reinterpret_cast<const T *>(max_it.
ptr());
297 for(; x <= (
input_width - vec_size); x += vec_size)
316 for(
int i = 0; i < sum_stages; ++i)
329 element = (in_ptr[x] - max_val) * beta;
330 sum += std::exp(element);
334 element = std::exp((in_ptr[x] - max_val) * beta);
337 tmp_ptr[x] = element;
342 sum_inversed = T(1) / sum;
346 sum = static_cast<T>(std::log(sum));
354 for(; x <= (
input_width - vec_size); x += vec_size)
357 auto normalized_value =
wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
373 out_ptr[x] = tmp_ptr[x] - sum;
377 out_ptr[x] = tmp_ptr[x] * sum_inversed;
382 in_it, max_it, out_it);
TensorShape shape
Shape of the valid region.
void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
uint8x16_t vloadq(const uint8_t *ptr)
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Describe one of the image's dimensions with a start, end and step.
Interface for CPU tensor.
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
typename neon_bitvector< T, BW >::tag_type neon_bitvector_tag_t
Helper type template to get the tag type of a neon vector.
T x() const
Alias to access the size of the first dimension.
uint8x8_t vpadd(const uint8x8_t &a, const uint8x8_t &b)
void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
typename neon_vector< T, S >::type neon_vector_t
Helper type template to get the type of a neon vector.
void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
UniformQuantizationInfo uniform() const
Return per layer quantization info.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
uint8x8_t vgetlow(const uint8x16_t val)
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
uint8x8_t vgethigh(const uint8x16_t val)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vqsub(const uint8x8_t &a, const uint8x8_t &b)
float32x4_t vexpq_f32(float32x4_t x)
Calculate exponential.
void vstore(uint8_t *ptr, uint8x8_t val)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
Includes all wrapper headers at once.
uint8x8_t vpmax(const uint8x8_t &a, const uint8x8_t &b)
constexpr int end() const
Return the end of the dimension.
Iterator updated by execute_window_loop for each window element.
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
constexpr int start() const
Return the start of the dimension.
float32x4_t vexpq(const float32x4_t &a)
Describe a multidimensional execution window.
Coordinates anchor
Anchor for the start of the valid region.
constexpr const Dimension & x() const
Alias to access the first dimension of the window.