24 #ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H 25 #define SRC_CORE_NEON_KERNELS_ELEMENTWISE_LIST_H 35 template <
typename InputScalarType,
typename OutputScalarType,
typename InputVectorType>
37 OutputScalarType (*scalar_func)(
const InputScalarType &,
const InputScalarType &),
38 int (*broadcast_func)(
int,
int,
int,
const InputScalarType *,
const InputScalarType &, OutputScalarType *,
const bool),
39 int (*neon_func)(
int,
int,
int,
const InputScalarType *,
const InputScalarType *, OutputScalarType *))
49 const int window_step_x = std::min(16 / static_cast<int>(
sizeof(OutputScalarType)), 8);
50 const auto window_start_x =
static_cast<int>(window.
x().
start());
51 const auto window_end_x =
static_cast<int>(window.
x().
end());
54 if(is_broadcast_across_x)
56 const bool is_broadcast_input_2 = input2_win.
x().
step() == 0;
57 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
58 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
59 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
60 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
65 Iterator broadcast_input(broadcast_tensor, broadcast_win);
66 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
71 auto output_ptr =
reinterpret_cast<OutputScalarType *
>(output.
ptr());
72 const auto non_broadcast_input_ptr =
reinterpret_cast<const InputScalarType *
>(non_broadcast_input.
ptr());
73 const InputScalarType broadcast_value = *
reinterpret_cast<const InputScalarType *
>(broadcast_input.
ptr());
75 int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
76 for(; x < window_end_x; ++x)
78 const auto a = *(non_broadcast_input_ptr + x);
79 *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
82 broadcast_input, non_broadcast_input, output);
96 auto output_ptr =
reinterpret_cast<OutputScalarType *
>(output.
ptr());
97 const auto input1_ptr =
reinterpret_cast<const InputScalarType *
>(input1.
ptr());
98 const auto input2_ptr =
reinterpret_cast<const InputScalarType *
>(input2.
ptr());
100 int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
101 for(; x < window_end_x; ++x)
103 const auto a = *(input1_ptr + x);
104 const auto b = *(input2_ptr + x);
105 *(output_ptr + x) = (*scalar_func)(a,
b);
108 input1, input2, output);
112 template <ArithmeticOperation op,
typename ScalarType>
115 auto res = ScalarType(0);
120 res = std::max(a, b);
123 res = std::min(a, b);
127 res = (a -
b) * (a - b);
132 res = (a > 0 ? a : a *
b);
138 if(std::is_integral<ScalarType>::value)
140 res = (b == 0) ? 0 : res;
141 if(static_cast<int32_t>(a) %
static_cast<int32_t
>(
b) != 0 && ((a < 0) != (b < 0)))
150 res = std::pow(a, b);
159 template <ArithmeticOperation op,
typename VectorType>
163 using scalar_type =
typename VectorType::scalar_type;
164 using tag_type =
typename VectorType::tag_type;
166 vec_type res =
wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
184 const vec_type zero =
wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
200 inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(
const int32x4_t &a,
const int32x4_t &
b)
206 inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(
const float32x4_t &a,
const float32x4_t &
b)
212 inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(
const float32x4_t &a,
const float32x4_t &
b)
217 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 219 inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(
const float16x8_t &a,
const float16x8_t &
b)
225 inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(
const float16x8_t &a,
const float16x8_t &
b)
229 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 231 template <ArithmeticOperation op,
typename ScalarType,
typename VectorType>
234 using tag_type =
typename VectorType::tag_type;
237 vec_type broadcast_vector =
wrapper::vdup_n(broadcast_value, tag_type{});
238 return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
241 template <ArithmeticOperation op,
typename ScalarType,
typename VectorType>
243 const ScalarType *input1_ptr,
const ScalarType *input2_ptr, ScalarType *output_ptr)
245 int x = window_start_x;
246 for(; x <= (window_end_x - window_step_x); x += window_step_x)
250 wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a,
b));
255 template <ArithmeticOperation op,
typename ScalarType,
typename VectorType>
257 const ScalarType *non_broadcast_input_ptr,
const ScalarType &broadcast_value, ScalarType *output_ptr,
const bool reorder)
259 int x = window_start_x;
260 for(; x <= (window_end_x - window_step_x); x += window_step_x)
263 wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
268 template <ArithmeticOperation op,
typename VectorType>
271 using scalar_type =
typename VectorType::scalar_type;
273 elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
274 &elementwise_arithm_op_scalar<op, scalar_type>,
275 &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
276 &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
279 template <ComparisonOperation op,
typename InputScalarType>
307 return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
310 template <ComparisonOperation op,
typename InputVectorType,
typename OutputVectorType>
313 OutputVectorType res = { 0, 0, 0, 0 };
342 template <ComparisonOperation op,
typename InputScalarType,
typename InputVectorType,
typename OutputVectorType>
346 return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
349 template <ComparisonOperation op,
typename InputScalarType,
typename InputVectorType>
351 const InputScalarType *non_broadcast_input_ptr,
const InputScalarType &broadcast_value, uint8_t *output_ptr,
const bool reorder)
353 int x = window_start_x;
354 for(; x <= (window_end_x - window_step_x); x += window_step_x)
356 const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(
wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
362 template <ComparisonOperation op,
typename InputScalarType,
typename InputVectorType>
364 const InputScalarType *non_broadcast_input_ptr,
const InputScalarType &broadcast_value, uint8_t *output_ptr,
const bool reorder)
366 int x = window_start_x;
367 for(; x <= (window_end_x - window_step_x); x += window_step_x)
369 const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(
wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
375 template <ComparisonOperation op,
typename InputScalarType,
typename InputVectorType>
377 const InputScalarType *non_broadcast_input_ptr,
const InputScalarType &broadcast_value, uint8_t *output_ptr,
const bool reorder)
379 int x = window_start_x;
380 for(; x <= (window_end_x - window_step_x); x += window_step_x)
382 const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
383 const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
386 if(x <= window_end_x - 4)
388 const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
389 for(
int i = 0; i < 4; i++)
398 template <ComparisonOperation op,
typename InputScalarType,
typename InputVectorType>
400 const InputScalarType *input1_ptr,
const InputScalarType *input2_ptr, uint8_t *output_ptr)
402 int x = window_start_x;
403 for(; x <= (window_end_x - window_step_x); x += window_step_x)
407 const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a,
b);
413 template <ComparisonOperation op,
typename InputScalarType,
typename InputVectorType>
415 const InputScalarType *input1_ptr,
const InputScalarType *input2_ptr, uint8_t *output_ptr)
417 int x = window_start_x;
418 for(; x <= (window_end_x - window_step_x); x += window_step_x)
422 const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a,
b);
428 template <ComparisonOperation op,
typename InputScalarType,
typename InputVectorType>
430 const InputScalarType *input1_ptr,
const InputScalarType *input2_ptr, uint8_t *output_ptr)
432 int x = window_start_x;
433 for(; x <= (window_end_x - window_step_x); x += window_step_x)
437 const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a,
b);
440 const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a,
b);
443 if(x <= window_end_x - 4)
447 const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a,
b);
448 for(
int i = 0; i < 4; i++)
457 template <ComparisonOperation op,
typename InputScalarType,
typename InputVectorType>
460 elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
461 &elementwise_comp_op_scalar<op, InputScalarType>,
462 &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
463 &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
466 template <ComparisonOperation op,
typename InputScalarType,
typename InputVectorType>
469 elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
470 &elementwise_comp_op_scalar<op, InputScalarType>,
471 &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
472 &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
475 template <ComparisonOperation op,
typename InputScalarType,
typename InputVectorType>
478 elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
479 &elementwise_comp_op_scalar<op, InputScalarType>,
480 &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
481 &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
float32x2_t vdiv(const float32x2_t &a, const float32x2_t &b)
int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
uint32x2_t vmovn(const uint64x2_t &a)
OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
constexpr int step() const
Return the step of the dimension.
int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
uint8x16_t vloadq(const uint8_t *ptr)
void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Describe one of the image's dimensions with a start, end and step.
uint8_t elementwise_comp_op_scalar(const InputScalarType &a, const InputScalarType &b)
int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
float32x4_t vpow(const float32x4_t &a, const float32x4_t &b)
decltype(strategy::transforms) typedef type
Interface for CPU tensor.
Copyright (c) 2017-2021 Arm Limited.
float32x4_t vfloorq_f32(float32x4_t val)
Calculate floor of a vector.
Greater equal comparison ( $x \geq y$ )
T x() const
Alias to access the size of the first dimension.
VectorType::type elementwise_arithm_op(const typename VectorType::type &a, const typename VectorType::type &b)
void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
uint8x8_t vnot(const uint8x8_t &a)
void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, OutputScalarType(*scalar_func)(const InputScalarType &, const InputScalarType &), int(*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), int(*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
Window broadcast_if_dimension_le_one(const TensorShape &shape) const
Don't advance in the dimension where shape is less equal to 1.
OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
y*x if x < 0, x otherwise
int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x, const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
uint8x8_t vcgt(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vbsl(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
ScalarType elementwise_arithm_op_scalar(const ScalarType &a, const ScalarType &b)
int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
Less equal comparison ( $x \leq y$ )
void vstore(uint8_t *ptr, uint8x8_t val)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x, const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
Includes all wrapper headers at once.
void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
constexpr int end() const
Return the end of the dimension.
uint8x8_t vcge(const uint8x8_t &a, const uint8x8_t &b)
Iterator updated by execute_window_loop for each window element.
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
constexpr int start() const
Return the start of the dimension.
Describe a multidimensional execution window.
uint8x8_t vceq(const uint8x8_t &a, const uint8x8_t &b)
Sets the macro arm_any if compiling for Aarch32 or Aarch64.
constexpr const Dimension & x() const
Alias to access the first dimension of the window.