24 #ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H 25 #define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H 38 template <
typename float_vec_type,
typename int_vec_type>
// Converts a NEON float vector into the requested NEON integer vector type.
// Primary template is declaration-only; specialized below for
// float32x4x4_t -> uint8x16_t and float32x4x4_t -> int8x16_t.
39 int_vec_type convert_float_to_int(
const float_vec_type &in);
41 template <
typename float_vec_type,
typename int_vec_type>
// Converts a NEON integer vector into the requested NEON float vector type.
// Primary template is declaration-only; specialized below for
// uint8x16_t -> float32x4x4_t and int8x16_t -> float32x4x4_t.
42 float_vec_type convert_int_to_float(
const int_vec_type &in);
// Specialization: packs four float32x4 lanes into a single uint8x16_t
// (body elided in this view; presumably delegates to
// convert_float32x4x4_to_uint8x16 — confirm against the full source).
45 uint8x16_t convert_float_to_int<float32x4x4_t, uint8x16_t>(
const float32x4x4_t &in)
// Specialization: packs four float32x4 lanes into a single int8x16_t
// (body elided in this view; presumably delegates to
// convert_float32x4x4_to_int8x16 — confirm against the full source).
53 int8x16_t convert_float_to_int<float32x4x4_t, int8x16_t>(
const float32x4x4_t &in)
// Specialization: widens a uint8x16_t into four float32x4 lanes
// (body elided in this view; presumably delegates to
// convert_uint8x16_to_float32x4x4 — confirm against the full source).
61 float32x4x4_t convert_int_to_float<float32x4x4_t, uint8x16_t>(
const uint8x16_t &in)
// Specialization: widens an int8x16_t into four float32x4 lanes
// (body elided in this view; presumably delegates to
// convert_int8x16_to_float32x4x4 — confirm against the full source).
67 float32x4x4_t convert_int_to_float<float32x4x4_t, int8x16_t>(
const int8x16_t &in)
79 constexpr
int window_step_x = 16 /
sizeof(T);
80 const auto window_start_x =
static_cast<int>(window.
x().
start());
81 const auto window_end_x =
static_cast<int>(window.
x().
end());
88 const int sum_stages = log2(window_step_x / 2);
92 const auto in_ptr =
reinterpret_cast<const T *
>(input.
ptr());
93 const auto out_ptr =
reinterpret_cast<T *
>(output.
ptr());
96 auto vec_max =
wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
97 int x = window_start_x;
99 for(; x <= (window_end_x - window_step_x); x += window_step_x)
106 for(
int i = 0; i < sum_stages; ++i)
113 for(; x < window_end_x; ++x)
115 max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
123 template <
typename T>
127 static_assert(std::is_same<T, qasymm8_t>::value
128 || std::is_same<T, qasymm8_signed_t>::value,
129 "quantized type should be either qasymm8_t or qasymm8_signed_t.");
135 const auto scale_beta_vec = vdupq_n_f32(scale_beta);
140 constexpr
int vec_size = 16;
145 const auto in_ptr =
reinterpret_cast<const T *
>(in_it.
ptr()) + start_x;
146 const auto out_ptr =
reinterpret_cast<T *
>(out_it.
ptr()) + start_x;
147 const auto tmp_ptr =
reinterpret_cast<float *
>(tmp);
150 float sum_inversed{};
155 const auto max_val = *
reinterpret_cast<const T *
>(max_it.
ptr());
159 float32x4x4_t vec_sum =
169 for(; x <= (input_width - vec_size); x += vec_size)
173 auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
177 vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
178 vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
179 vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
180 vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
181 vec_sum.val[0] = vaddq_f32(vec_sum.val[0],
vexpq_f32(vec_elements_flt.val[0]));
182 vec_sum.val[1] = vaddq_f32(vec_sum.val[1],
vexpq_f32(vec_elements_flt.val[1]));
183 vec_sum.val[2] = vaddq_f32(vec_sum.val[2],
vexpq_f32(vec_elements_flt.val[2]));
184 vec_sum.val[3] = vaddq_f32(vec_sum.val[3],
vexpq_f32(vec_elements_flt.val[3]));
188 vec_elements_flt.val[0] =
vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
189 vec_elements_flt.val[1] =
vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
190 vec_elements_flt.val[2] =
vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
191 vec_elements_flt.val[3] =
vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
192 vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
193 vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
194 vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
195 vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
198 vst4q_f32(tmp_ptr + x, vec_elements_flt);
202 const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
203 auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
204 sum_res = vpadd_f32(sum_res, sum_res);
213 element = (max_val - in_ptr[x]) * scale_beta;
214 sum += std::exp(element);
218 element = std::exp((max_val - in_ptr[x]) * scale_beta);
222 tmp_ptr[x] = element;
227 sum_inversed = 256.f /
sum;
237 constexpr
bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
240 for(; x <= (input_width - vec_size); x += vec_size)
243 float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
244 int_vec_type normalized_value{};
247 const float32x4x4_t sub =
249 vsubq_f32(vec_in.val[0], vdupq_n_f32(
sum)),
250 vsubq_f32(vec_in.val[1], vdupq_n_f32(
sum)),
251 vsubq_f32(vec_in.val[2], vdupq_n_f32(
sum)),
252 vsubq_f32(vec_in.val[3], vdupq_n_f32(
sum)),
254 normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
260 vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
261 vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
262 vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
263 vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
266 if(is_qasymm8_signed)
275 normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
293 in_it, max_it, out_it);
296 template <
typename T>
298 ITensor *out,
const float beta,
bool is_log,
const Window &window)
310 constexpr
int vec_size = 16 /
sizeof(T);
311 const int sum_stages = log2(vec_size / 2);
316 const auto in_ptr =
reinterpret_cast<const T *
>(in_it.
ptr()) + start_x;
317 const auto out_ptr =
reinterpret_cast<T *
>(out_it.
ptr()) + start_x;
318 const auto tmp_ptr =
reinterpret_cast<T *
>(tmp);
326 const auto max_val = *
reinterpret_cast<const T *
>(max_it.
ptr());
334 for(; x <= (input_width - vec_size); x += vec_size)
353 for(
int i = 0; i < sum_stages; ++i)
366 element = (in_ptr[x] - max_val) * beta;
367 sum += std::exp(element);
371 element = std::exp((in_ptr[x] - max_val) * beta);
374 tmp_ptr[x] = element;
379 sum_inversed = T(1) /
sum;
383 sum =
static_cast<T
>(std::log(
sum));
391 for(; x <= (input_width - vec_size); x += vec_size)
394 auto normalized_value =
wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
410 out_ptr[x] = tmp_ptr[x] -
sum;
414 out_ptr[x] = tmp_ptr[x] * sum_inversed;
419 in_it, max_it, out_it);
TensorShape shape
Shape of the valid region.
void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
uint8x16_t vloadq(const uint8_t *ptr)
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in)
Converts from int8x16_t to float32x4x4_t.
float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in)
Converts from uint8x16_t to float32x4x4_t.
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Describe one of the image's dimensions with a start, end and step.
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
typename neon_bitvector< T, BW >::tag_type neon_bitvector_tag_t
Helper type template to get the tag type of a neon vector.
T x() const
Alias to access the size of the first dimension.
uint8x8_t vpadd(const uint8x8_t &a, const uint8x8_t &b)
void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window)
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
typename neon_vector< T, S >::type neon_vector_t
Helper type template to get the type of a neon vector.
void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out)
Converts from float32x4x4_t to just one int8x16_t.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
uint8x8_t vgetlow(const uint8x16_t val)
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
uint8x8_t vgethigh(const uint8x16_t val)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vqsub(const uint8x8_t &a, const uint8x8_t &b)
float32x4_t vexpq_f32(float32x4_t x)
Calculate exponential.
void vstore(uint8_t *ptr, uint8x8_t val)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
Converts from float32x4x4_t to just one uint8x16_t.
Includes all wrapper headers at once.
uint8x8_t vpmax(const uint8x8_t &a, const uint8x8_t &b)
constexpr int end() const
Return the end of the dimension.
Iterator updated by execute_window_loop for each window element.
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
constexpr int start() const
Return the start of the dimension.
float32x4_t vexpq(const float32x4_t &a)
Describe a multidimensional execution window.
Coordinates anchor
Anchor for the start of the valid region.
constexpr const Dimension & x() const
Alias to access the first dimension of the window.