Arm Compute Library 24.02.1
Go to the documentation of this file.
24 #ifndef ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
25 #define ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
42 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
43 inline float16_t wrapper_vaddv(
const float16x8_t &a,
int sum_stages)
46 for (
int i = 0; i < sum_stages; ++i)
52 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
54 inline float wrapper_vaddv(
const float32x4_t &a,
int sum_stages)
57 return wrapper::vaddv(a);
64 template <
typename T,
bool IS_LOG>
77 constexpr
int vec_size = 16 /
sizeof(T);
79 const int sum_stages = log2(vec_size >> 1);
81 const auto beta_vec =
wrapper::vdup_n(
static_cast<T
>(beta), ExactTagType{});
88 const T *in_ptr =
reinterpret_cast<const T *
>(in_it.
ptr());
89 T *out_ptr =
reinterpret_cast<T *
>(out_it.
ptr());
96 auto vec_max =
wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
99 for (; x <= (input_width - vec_size); x += vec_size)
106 max_val = wrapper::vmaxv(vec_max);
110 for (
int i = 0; i < sum_stages; ++i)
116 #endif // __aarch64__
119 for (; x < input_width; ++x)
121 max_val = std::max(*(in_ptr + x), max_val);
137 for (; x <= (input_width - vec_size); x += vec_size)
157 sum = wrapper_vaddv(vec_sum, sum_stages);
160 for (
int i = 0; i < sum_stages; ++i)
165 #endif // __aarch64__
168 for (; x < input_width; ++x)
174 element = (in_ptr[x] - max_val) * beta;
175 sum += std::exp(element);
179 element = std::exp((in_ptr[x] - max_val) * beta);
183 out_ptr[x] = element;
188 sum_transformed = T(1) / sum;
192 sum_transformed =
static_cast<T
>(std::log(sum));
198 const auto sum_vec =
wrapper::vdup_n(
static_cast<T
>(sum_transformed), ExactTagType{});
202 for (; x <= (input_width - vec_size); x += vec_size)
216 for (; x < input_width; ++x)
220 out_ptr[x] = out_ptr[x] - sum_transformed;
224 out_ptr[x] = out_ptr[x] * sum_transformed;
232 template <
typename T,
bool IS_LOG>
237 #endif // ACL_SRC_CPU_KERNELS_SOFTMAX_GENERIC_NEON_IMPL_H
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
float32x4_t vexpq(const float32x4_t &a)
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
Interface for CPU tensor.
uint8x16_t vloadq(const uint8_t *ptr)
Includes all wrapper headers at once.
uint8x8_t vpadd(const uint8x8_t &a, const uint8x8_t &b)
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&...iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function on each window element.
Iterator updated by execute_window_loop for each window element.
T x() const
Alias to access the size of the first dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vgetlow(const uint8x16_t val)
void vstore(uint8_t *ptr, uint8x8_t val)
Describe a multidimensional execution window.
typename neon_bitvector< T, BW >::tag_type neon_bitvector_tag_t
Helper type template to get the tag type of a neon vector.
void neon_softmax_quantized(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window)
TensorShape shape
Shape of the valid region.
Copyright (c) 2017-2024 Arm Limited.
uint8x8_t vpmax(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vgethigh(const uint8x16_t val)
void neon_softmax_float(const ITensor *in, void *const tmp, ITensor *out, float beta, const Window &window)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)