Compute Library
 21.05
arm_compute::cpu Namespace Reference

Namespaces

 kernels
 

Data Structures

class  CpuActivation
 Basic function to run kernels::CpuActivationKernel. More...
 
class  CpuAdd
 Basic function to run kernels::CpuAddKernel (see the usage sketch at the end of this table). More...
 
struct  CpuCapabilities
 Structure that encodes the CPU capabilities to be used. More...
 
class  CpuComplexMul
 Basic function to run kernels::CpuComplexMulKernel. More...
 
class  CpuConcatenate
 Basic function to concatenate tensors along a given axis. More...
 
class  CpuContext
 CPU context implementation class. More...
 
class  CpuConvertFullyConnectedWeights
 Basic function to run kernels::CpuConvertFullyConnectedWeightsKernel. More...
 
class  CpuCopy
 Basic function to run kernels::CpuCopyKernel. More...
 
class  CpuDepthwiseConvolution
 Function to execute a depthwise convolution. More...
 
class  CpuDepthwiseConvolutionAssemblyDispatch
 Depthwise convolution assembly kernel glue. More...
 
class  CpuDequantization
 Basic function to run kernels::CpuDequantizationKernel that dequantizes an input tensor. More...
 
class  CpuDirectConvolution
 Function to run the direct convolution. More...
 
class  CpuElementwiseArithmetic
 Class to run cpu::kernels::CpuArithmeticKernel for all arithmetic operations except division and power. More...
 
class  CpuElementwiseBase
 
class  CpuElementwiseComparison
 Basic function to run cpu::kernels::CpuComparisonKernel. More...
 
class  CpuElementwiseComparisonStatic
 Basic function to run cpu::kernels::CpuComparisonKernel. More...
 
class  CpuElementwiseDivision
 Basic function to run cpu::kernels::CpuArithmeticKernel for division. More...
 
class  CpuElementwisePower
 Basic function to run cpu::kernels::CpuArithmeticKernel for power. More...
 
class  CpuElementwiseUnary
 
class  CpuFill
 Basic function to run kernels::CpuFillKernel. More...
 
class  CpuFloor
 Basic function to run kernels::CpuFloorKernel. More...
 
class  CpuLogits1DSoftmaxKernel
 
class  CpuMul
 Basic function to run kernels::CpuMulKernel. More...
 
class  CpuPermute
 Basic function to run kernels::CpuPermuteKernel. More...
 
class  CpuPooling
 Basic function to simulate a pooling layer with the specified pooling operation. More...
 
class  CpuPoolingAssemblyDispatch
 Basic function to run pooling assembly kernels. More...
 
class  CpuQuantization
 Basic function to simulate a quantization layer. More...
 
class  CpuQueue
 CPU queue implementation class. More...
 
class  CpuReshape
 Basic function to run kernels::CpuReshapeKernel. More...
 
class  CpuScale
 Basic function to compute Scale. More...
 
class  CpuSoftmaxGeneric
 Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. More...
 
class  CpuSub
 Basic function to run kernels::CpuSubKernel. More...
 
class  CpuTensor
 CPU tensor implementation class. More...
 
class  CpuTranspose
 Basic function to run kernels::CpuTransposeKernel. More...
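 
The operator classes in this table follow the library's stateless operator pattern: an operator is configured once against ITensorInfo descriptors, and the concrete tensors are bound at run time through an ITensorPack. The sketch below illustrates the pattern with cpu::CpuAdd. It is a minimal example written against the 21.05 sources; the internal header path and the exact configure() argument list are assumptions and are not part of the stable public API.

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/runtime/cpu/operators/CpuAdd.h" // internal header; path assumed for 21.05

    using namespace arm_compute;

    void cpu_add_example()
    {
        // Describe the operands; cpu operators are configured on ITensorInfo only.
        Tensor a, b, dst;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);
        dst.allocator()->init(info);

        // Configure the stateless operator (signature assumed from the 21.05 sources).
        cpu::CpuAdd add;
        add.configure(a.info(), b.info(), dst.info(), ConvertPolicy::SATURATE, ActivationLayerInfo());

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();

        // Bind the concrete tensors at run time through a tensor pack.
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_0, &a);
        pack.add_tensor(TensorType::ACL_SRC_1, &b);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        add.run(pack);
    }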
 

Typedefs

using ICpuKernel = arm_compute::ICPPKernel
 
using ICpuOperator = experimental::INEOperator
 
using CpuElementwiseMax = CpuElementwiseArithmetic< ArithmeticOperation::MAX >
 Class to run cpu::kernels::CpuArithmeticKernel for the maximum operation. More...
 
using CpuElementwiseMin = CpuElementwiseArithmetic< ArithmeticOperation::MIN >
 Class to run cpu::kernels::CpuArithmeticKernel for the minimum operation. More...
 
using CpuElementwiseSquaredDiff = CpuElementwiseArithmetic< ArithmeticOperation::SQUARED_DIFF >
 Class to run cpu::kernels::CpuArithmeticKernel for the squared-difference operation. More...
 
using NEEqual = CpuElementwiseComparisonStatic< ComparisonOperation::Equal >
 Basic function to run equal comparison. More...
 
using NENotEqual = CpuElementwiseComparisonStatic< ComparisonOperation::NotEqual >
 Basic function to run not equal comparison. More...
 
using NEGreater = CpuElementwiseComparisonStatic< ComparisonOperation::Greater >
 Basic function to run greater comparison. More...
 
using NEGreaterEqual = CpuElementwiseComparisonStatic< ComparisonOperation::GreaterEqual >
 Basic function to run greater-equal comparison. More...
 
using NELess = CpuElementwiseComparisonStatic< ComparisonOperation::Less >
 Basic function to run less comparison. More...
 
using NELessEqual = CpuElementwiseComparisonStatic< ComparisonOperation::LessEqual >
 Basic function to run less-equal comparison. More...
 
using KernelType = kernels::CpuElementwiseUnaryKernel
 
using CpuPRelu = CpuElementwiseArithmetic< ArithmeticOperation::PRELU >
 Class to run cpu::kernels::CpuArithmeticKernel for the PRelu operation. More...
 
using CpuSoftmax = CpuSoftmaxGeneric< false >
 
using CpuLogSoftmax = CpuSoftmaxGeneric< true >
 

Functions

void qasymm8_neon_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void qasymm8_sve_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void qasymm8_signed_neon_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void qasymm8_signed_sve_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void qsymm16_neon_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void qsymm16_sve_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void fp16_neon_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void fp16_sve_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void fp32_neon_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void fp32_sve_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void add_u8_u8_s16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_s16_u8_s16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_u8_s16_s16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_qasymm8_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_qasymm8_signed_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_qsymm16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template<typename ScalarType >
void add_same_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template<typename InputScalarType , typename OutputScalarType , typename InputVectorType >
void elementwise_op (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, OutputScalarType(*scalar_func)(const InputScalarType &, const InputScalarType &), int(*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), int(*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
 
template<ArithmeticOperation op, typename ScalarType >
ScalarType elementwise_arithm_op_scalar (const ScalarType &a, const ScalarType &b)
 
template<ArithmeticOperation op, typename VectorType >
VectorType::type elementwise_arithm_op (const typename VectorType::type &a, const typename VectorType::type &b)
 
template<>
int32x4_t elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< int32_t, 4 > > (const int32x4_t &a, const int32x4_t &b)
 
template<>
float32x4_t elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< float, 4 > > (const float32x4_t &a, const float32x4_t &b)
 
template<>
float32x4_t elementwise_arithm_op< ArithmeticOperation::POWER, typename wrapper::traits::neon_vector< float, 4 > > (const float32x4_t &a, const float32x4_t &b)
 
template<ArithmeticOperation op, typename ScalarType , typename VectorType >
VectorType::type elementwise_arithm_op_broadcast (const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
 
template<ArithmeticOperation op, typename ScalarType , typename VectorType >
int elementwise_arithm_op_loop (int window_start_x, int window_end_x, int window_step_x, const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
 
template<ArithmeticOperation op, typename ScalarType , typename VectorType >
int elementwise_arithm_op_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
 
template<ArithmeticOperation op, typename VectorType >
void elementwise_arithm_op (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op, typename InputScalarType >
uint8_t elementwise_comp_op_scalar (const InputScalarType &a, const InputScalarType &b)
 
template<ComparisonOperation op, typename InputVectorType , typename OutputVectorType >
OutputVectorType elementwise_comp_op (const InputVectorType &a, const InputVectorType &b)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType , typename OutputVectorType >
OutputVectorType elementwise_comp_op_broadcast (const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_broadcast_8_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_broadcast_16_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_broadcast_32_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_8_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_16_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_32_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
void elementwise_comp_op_8 (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
void elementwise_comp_op_16 (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
void elementwise_comp_op_32 (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
float32x4x4_t load_quantized (const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
 
float32x4x4_t load_quantized_signed (const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
 
void store_quantized (uint8_t *output_ptr, const uint32x4x4_t &out)
 
void store_quantized (uint8_t *output_ptr, const int32x4x4_t &out)
 
void store_quantized (uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
 
void store_quantized_signed (int8_t *output_ptr, const int32x4x4_t &out)
 
void store_quantized_signed (int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
 
template<ArithmeticOperation op>
uint8_t elementwise_arithm_op_quantized_scalar (const float &a, const float &b, UniformQuantizationInfo qinfo)
 
template<ArithmeticOperation op>
int8_t elementwise_arithm_op_quantized_signed_scalar (const float &a, const float &b, UniformQuantizationInfo qinfo)
 
template<ArithmeticOperation op>
float32x4x4_t elementwise_arithm_op (const float32x4x4_t &a, const float32x4x4_t &b)
 
template<ComparisonOperation op>
uint8_t elementwise_comp_op_quantized_scalar (const float &a, const float &b, UniformQuantizationInfo qinfo)
 
template<ComparisonOperation op>
uint32x4x4_t elementwise_comp_op (const float32x4x4_t &a, const float32x4x4_t &b)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_singed_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_signed_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_signed_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_signed_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
void elementwise_op_quantized (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, uint8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
 
void elementwise_comp_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, uint8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
 
void elementwise_op_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
 
template<ArithmeticOperation op>
void elementwise_arithm_op_quantized (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void elementwise_arithm_op_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void elementwise_comp_op_quantized (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void elementwise_comp_op_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<typename ScalarType >
ScalarType elementwise_op_scalar_imp (ElementWiseUnary op, const ScalarType &a)
 
template<typename ScalarType , typename VectorType >
VectorType elementwise_op_imp (ElementWiseUnary op, const VectorType &a)
 
template<typename ScalarType >
void elementwise_op (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
void fp16_neon_floor (const void *src, void *dst, int len)
 
void fp32_neon_floor (const void *src, void *dst, int len)
 
void poolingMxN_fp32_neon_nhwc (const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
 
void poolingMxN_qasymm8_neon_nhwc (const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
 
void poolingMxN_qasymm8_signed_neon_nhwc (const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
 
void poolingMxN_fp16_neon_nhwc (const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
 
template<typename T >
uint32_t offset_no_padding (uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout)
 
template<typename T >
std::enable_if< std::is_same< T, int8_t >::value, int8_t >::type quantize (float val, const UniformQuantizationInfo &info)
 
template<typename T >
std::enable_if< std::is_same< T, uint8_t >::value, uint8_t >::type quantize (float val, const UniformQuantizationInfo &info)
 
template<typename T >
T vcvtq_q32_f32 (float32x4_t values)
 
template<>
uint32x4_t vcvtq_q32_f32 (float32x4_t values)
 
template<>
int32x4_t vcvtq_q32_f32 (float32x4_t values)
 
template<typename T >
float32x4_t vcvtq_f32_q32 (T values)
 
template<>
float32x4_t vcvtq_f32_q32 (uint32x4_t values)
 
template<>
float32x4_t vcvtq_f32_q32 (int32x4_t values)
 
template<typename Tout >
Tout vrequantize_pooling_with_scale (const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
 
template<>
uint8x16_t vrequantize_pooling_with_scale (const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
 
template<>
int8x16_t vrequantize_pooling_with_scale (const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
 
template<typename Tin , typename Tout >
Tout vrequantize_pooling (Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo)
 
template<>
uint8x16_t vrequantize_pooling (uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
 
template<>
int8x16_t vrequantize_pooling (int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
 
template<typename T >
T vrequantize_pooling (T &vec, const UniformQuantizationInfo &requant_qinfo)
 
template<>
uint8x8_t vrequantize_pooling (uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
 
template<>
int8x8_t vrequantize_pooling (int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
 
float calculate_avg_scale (bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 
template<typename T >
void poolingMxN_q8_neon_nhwc (const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
 
void u8_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void s16_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_signed_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
template<typename T >
void nearest_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset, bool align_corners, const Window &window)
 
template<typename T >
void bilinear_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
template<typename T >
void common_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void fp16_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void fp32_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void s16_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void u8_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_signed_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
template<typename T >
void neon_logits_1d_max (const ITensor *in, ITensor *out, const Window &window)
 
template<typename T >
void neon_softmax_logits_1d_quantized (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
 
template<typename T >
void neon_softmax_logits_1d_float (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void sub_s16_u8_s16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void sub_u8_s16_s16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void sub_u8_u8_s16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void sub_qasymm8_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void sub_qasymm8_signed_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void sub_qsymm16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template<typename T >
void sub_same_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void fp16_neon_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 
void fp16_sve_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 
void fp32_neon_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 
void fp32_sve_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
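 
Free functions such as add_qasymm8_neon(), add_qasymm8_signed_neon(), add_qsymm16_neon() and add_same_neon<T>() above are data-type-specialised micro-kernels that operate over a caller-supplied execution Window. A kernel implementation typically picks one of them based on the destination data type, roughly as in the following sketch. This is a simplified illustration, not the actual kernels::CpuAddKernel code; run_add is a hypothetical helper name.

    // Hypothetical dispatcher: selects the micro-kernel matching the output type.
    void run_add(const ITensor *src0, const ITensor *src1, ITensor *dst,
                 ConvertPolicy policy, const Window &window)
    {
        switch(dst->info()->data_type())
        {
            case DataType::QASYMM8:
                add_qasymm8_neon(src0, src1, dst, policy, window);
                break;
            case DataType::QASYMM8_SIGNED:
                add_qasymm8_signed_neon(src0, src1, dst, policy, window);
                break;
            case DataType::QSYMM16:
                add_qsymm16_neon(src0, src1, dst, policy, window);
                break;
            case DataType::F32:
                add_same_neon<float>(src0, src1, dst, policy, window);
                break;
            default:
                ARM_COMPUTE_ERROR("Unsupported data type");
        }
    }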
 

Variables

constexpr int step = 4
 

Typedef Documentation

◆ CpuElementwiseMax

Class to run cpu::kernels::CpuArithmeticKernel for the maximum operation.

Definition at line 67 of file CpuElementwise.h.

◆ CpuElementwiseMin

Class to run cpu::kernels::CpuArithmeticKernel for the minimum operation.

Definition at line 69 of file CpuElementwise.h.

◆ CpuElementwiseSquaredDiff

Class to run cpu::kernels::CpuArithmeticKernel for the squared-difference operation.

Definition at line 71 of file CpuElementwise.h.

◆ CpuLogSoftmax

Definition at line 101 of file CpuSoftmax.h.

◆ CpuPRelu

Class to run cpu::kernels::CpuArithmeticKernel for the PRelu operation.

Definition at line 34 of file CpuPRelu.h.

◆ CpuSoftmax

using CpuSoftmax = CpuSoftmaxGeneric<false>

Definition at line 100 of file CpuSoftmax.h.

◆ ICpuKernel

Definition at line 33 of file ICpuKernel.h.

◆ ICpuOperator

Definition at line 33 of file ICpuOperator.h.

◆ KernelType

using KernelType = kernels::CpuElementwiseUnaryKernel

◆ NEEqual

Basic function to run equal comparison.

Definition at line 182 of file CpuElementwise.h.

◆ NEGreater

Basic function to run greater comparison.

Definition at line 186 of file CpuElementwise.h.

◆ NEGreaterEqual

Basic function to run greater-equal comparison.

Definition at line 188 of file CpuElementwise.h.

◆ NELess

Basic function to run less comparison.

Definition at line 190 of file CpuElementwise.h.

◆ NELessEqual

Basic function to run less-equal comparison.

Definition at line 192 of file CpuElementwise.h.

◆ NENotEqual

Basic function to run not equal comparison.

Definition at line 184 of file CpuElementwise.h.
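
All of the NE* comparison aliases above are instances of the same CpuElementwiseComparisonStatic template, each binding one ComparisonOperation at compile time, so further aliases can be formed the same way. A one-line sketch (MyGreater is a hypothetical name, assuming CpuElementwise.h is on the include path):

    // Equivalent in spirit to the NEGreater alias documented above.
    using MyGreater = arm_compute::cpu::CpuElementwiseComparisonStatic<arm_compute::ComparisonOperation::Greater>;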

Function Documentation

◆ add_qasymm8_neon()

void add_qasymm8_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 35 of file qasymm8.cpp.

36 {
37  ARM_COMPUTE_UNUSED(policy);
38 
39  // Create input windows
40  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
42 
43  // Clear X Dimension on execution window as we handle manually
44  Window win = window;
45  win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  const int window_step_x = 16;
48  const auto window_start_x = static_cast<int>(window.x().start());
49  const auto window_end_x = static_cast<int>(window.x().end());
50  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
51 
52  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
53  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
54  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
55 
56  const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
57  const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
58 
59  if(is_broadcast_across_x)
60  {
61  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
62  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
63  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
64  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
65  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
66  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
67  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
68 
69  const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
70  const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
71  const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
72  const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
73 
74  // Clear X Dimension on execution window as we handle manually
75  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
76 
77  Iterator broadcast_input(broadcast_tensor, broadcast_win);
78  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
79  Iterator output(dst, win);
80 
81  execute_window_loop(win, [&](const Coordinates &)
82  {
83  const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
84  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
85 
86  const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
87  const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
88 
89  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
90  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
91  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);
92  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);
93 
94  const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
95 
96  // Compute S elements per iteration
97  int x = window_start_x;
98  for(; x <= (window_end_x - window_step_x); x += window_step_x)
99  {
100  const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
101  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
102  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
103  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
104  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
105 
106  int32x4_t rf_0{};
107  int32x4_t rf_1{};
108  int32x4_t rf_2{};
109  int32x4_t rf_3{};
110 
111 #ifdef __aarch64__
112  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
113  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
114  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
115  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
116 #else //__aarch64__
117  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
118  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
119  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
120  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
121 #endif //__aarch64__
122 
123  const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
124  const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
125  vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
126  }
127 
128  // Compute left-over elements
129  for(; x < window_end_x; ++x)
130  {
131  const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
132  *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
133  }
134  },
135  broadcast_input, non_broadcast_input, output);
136  }
137  else
138  {
139  // Clear X Dimension on execution window as we handle manually
140  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
141  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
142 
143  Iterator input1(src0, input1_win);
144  Iterator input2(src1, input2_win);
145  Iterator output(dst, win);
146 
147  const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
148  const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
149  const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
150  const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
151 
152  execute_window_loop(win, [&](const Coordinates &)
153  {
154  const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
155  const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
156  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
157 
158  // Compute S elements per iteration
159  int x = window_start_x;
160  for(; x <= (window_end_x - window_step_x); x += window_step_x)
161  {
162  const uint8x16_t a = vld1q_u8(input1_ptr + x);
163  const uint8x16_t b = vld1q_u8(input2_ptr + x);
164 
165  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
166  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
167  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
168  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
169 
170  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
171  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
172  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);
173  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);
174 
175  int32x4_t rf_0{};
176  int32x4_t rf_1{};
177  int32x4_t rf_2{};
178  int32x4_t rf_3{};
179 
180 #ifdef __aarch64__
181  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
182  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
183  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
184  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
185 #else //__aarch64__
186  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
187  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
188  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
189  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
190 #endif //__aarch64__
191 
192  const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
193  const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
194  vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
195  }
196 
197  // Compute left-over elements
198  for(; x < window_end_x; ++x)
199  {
200  const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
201  const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
202  *(output_ptr + x) = quantize_qasymm8((afs + bfs), dst->info()->quantization_info());
203  }
204  },
205  input1, input2, output);
206  }
207 }

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), Window::DimX, arm_compute::test::validation::dst, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::quantize_qasymm8(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().
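
Both the vectorised main loop and the scalar left-over loop above implement the same dequantize, add, requantize arithmetic. Writing the input values as \f$q_a, q_b\f$, the input quantization parameters (scale, zero point) as \f$(s_1, z_1)\f$ and \f$(s_2, z_2)\f$, and the output parameters as \f$(s_o, z_o)\f$, each output element is, up to rounding-mode details:

\f[ q_{\mathrm{out}} = \operatorname{clamp}\!\left(\operatorname{round}\!\left(\frac{(q_a - z_1)\,s_1 + (q_b - z_2)\,s_2}{s_o}\right) + z_o,\ 0,\ 255\right) \f]

The saturating narrowing stores (vqmovn_s32 / vqmovun_s16) provide the final clamp to the QASYMM8 range; add_qasymm8_signed_neon() below performs the same computation with a clamp to [-128, 127].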

◆ add_qasymm8_signed_neon()

void add_qasymm8_signed_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 35 of file qasymm8_signed.cpp.

36 {
37  ARM_COMPUTE_UNUSED(policy);
38 
39  // Create input windows
40  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
42 
43  // Clear X Dimension on execution window as we handle manually
44  Window win = window;
45  win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  const int window_step_x = 16;
48  const auto window_start_x = static_cast<int>(window.x().start());
49  const auto window_end_x = static_cast<int>(window.x().end());
50  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
51 
52  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
53  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
54  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
55 
56  const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
57  const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
58 
59  if(is_broadcast_across_x)
60  {
61  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
62  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
63  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
64  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
65  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
66  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
67  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
68 
69  const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
70  const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
71  const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
72  const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
73 
74  // Clear X Dimension on execution window as we handle manually
75  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
76 
77  Iterator broadcast_input(broadcast_tensor, broadcast_win);
78  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
79  Iterator output(dst, win);
80 
81  execute_window_loop(win, [&](const Coordinates &)
82  {
83  const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
84  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
85 
86  const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
87  const int8x16_t broadcast_value_vec = vdupq_n_s8(broadcast_value);
88 
89  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2);
90  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2);
91  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2);
92  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2);
93  const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
94 
95  // Compute S elements per iteration
96  int x = window_start_x;
97  for(; x <= (window_end_x - window_step_x); x += window_step_x)
98  {
99  const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
100 
101  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
102  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
103  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
104  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
105 
106  int32x4_t rf_0{};
107  int32x4_t rf_1{};
108  int32x4_t rf_2{};
109  int32x4_t rf_3{};
110 
111 #ifdef __aarch64__
112  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
113  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
114  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
115  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
116 #else //__aarch64__
117  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
118  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
119  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
120  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
121 #endif //__aarch64__
122 
123  const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
124  const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
125  vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
126  }
127 
128  // Compute left-over elements
129  for(; x < window_end_x; ++x)
130  {
131  const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
132  *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info);
133  }
134  },
135  broadcast_input, non_broadcast_input, output);
136  }
137  else
138  {
139  // Clear X Dimension on execution window as we handle manually
140  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
141  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
142 
143  Iterator input1(src0, input1_win);
144  Iterator input2(src1, input2_win);
145  Iterator output(dst, win);
146 
147  const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
148  const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
149  const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
150  const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
151  execute_window_loop(win, [&](const Coordinates &)
152  {
153  const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
154  const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
155  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
156 
157  // Compute S elements per iteration
158  int x = window_start_x;
159  for(; x <= (window_end_x - window_step_x); x += window_step_x)
160  {
161  const int8x16_t a = vld1q_s8(input1_ptr + x);
162  const int8x16_t b = vld1q_s8(input2_ptr + x);
163 
164  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
165  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
166  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
167  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
168 
169  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2);
170  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2);
171  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2);
172  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2);
173 
174  int32x4_t rf_0{};
175  int32x4_t rf_1{};
176  int32x4_t rf_2{};
177  int32x4_t rf_3{};
178 
179 #ifdef __aarch64__
180  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
181  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
182  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
183  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
184 #else //__aarch64__
185  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
186  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
187  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
188  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
189 #endif //__aarch64__
190 
191  const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
192  const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
193  vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
194  }
195 
196  // Compute left-over elements
197  for(; x < window_end_x; ++x)
198  {
199  const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
200  const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
201  *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), dst->info()->quantization_info());
202  }
203  },
204  input1, input2, output);
205  }
206 }

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), Window::DimX, arm_compute::test::validation::dst, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::quantize_qasymm8_signed(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().

◆ add_qsymm16_neon()

void add_qsymm16_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 35 of file qsymm16.cpp.

36 {
37  ARM_COMPUTE_UNUSED(policy);
38 
39  // Create input windows
40  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
42 
43  // Clear X Dimension on execution window as we handle manually
44  Window win = window;
45  win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  const int window_step_x = 8;
48  const auto window_start_x = static_cast<int>(window.x().start());
49  const auto window_end_x = static_cast<int>(window.x().end());
50  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
51 
52  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
53  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
54  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
55 
56  const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
57  const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
58  const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
59 
60  if(is_broadcast_across_x)
61  {
62  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
63  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
64  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
65  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
66  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
67  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
68  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
69 
70  // Clear X Dimension on execution window as we handle manually
71  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
72 
73  Iterator broadcast_input(broadcast_tensor, broadcast_win);
74  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
75  Iterator output(dst, win);
76 
77  execute_window_loop(win, [&](const Coordinates &)
78  {
79  const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
80  const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
81 
82  const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
83  const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
84 
85  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
86  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
87  const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
88 
89  // Compute S elements per iteration
90  int x = window_start_x;
91  for(; x <= (window_end_x - window_step_x); x += window_step_x)
92  {
93  const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
94  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
95  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
96 
97  int32x4_t rf_0{};
98  int32x4_t rf_1{};
99 #ifdef __aarch64__
100  rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
101  rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
102 #else //__aarch64__
103  rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
104  rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
105 #endif //__aarch64__
106 
107  const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
108  vst1q_s16(output_ptr + x, pa);
109  }
110 
111  // Compute left-over elements
112  for(; x < window_end_x; ++x)
113  {
114  const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
115  *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
116  }
117  },
118  broadcast_input, non_broadcast_input, output);
119  }
120  else
121  {
122  // Clear X Dimension on execution window as we handle manually
123  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
124  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
125 
126  Iterator input1(src0, input1_win);
127  Iterator input2(src1, input2_win);
128  Iterator output(dst, win);
129 
130  execute_window_loop(win, [&](const Coordinates &)
131  {
132  const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
133  const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
134  const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
135 
136  // Compute S elements per iteration
137  int x = window_start_x;
138  for(; x <= (window_end_x - window_step_x); x += window_step_x)
139  {
140  const int16x8_t a = vld1q_s16(input1_ptr + x);
141  const int16x8_t b = vld1q_s16(input2_ptr + x);
142 
143  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
144  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
145  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
146  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
147 
148  int32x4_t rf_0{};
149  int32x4_t rf_1{};
150 #ifdef __aarch64__
151  rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
152  rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
153 #else //__aarch64__
154  rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
155  rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
156 #endif //__aarch64__
157 
158  const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
159  vst1q_s16(output_ptr + x, pa);
160  }
161 
162  // Compute left-over elements
163  for(; x < window_end_x; ++x)
164  {
165  const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
166  const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
167  *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info());
168  }
169  },
170  input1, input2, output);
171  }
172 }
int16_t quantize_qsymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given a 16-bit symmetric quantization scheme.
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
std::vector< NodeID > bfs(Graph &g)
Breadth first search traversal.

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), Window::DimX, arm_compute::test::validation::dst, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::quantize_qsymm16(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().
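
The per-element arithmetic above dequantizes both QSYMM16 inputs with their own scale, adds them in float, and requantizes with the output scale. A minimal scalar sketch follows; it is not library code, the scales are illustrative, and quantize_qsymm16_example is a hypothetical stand-in for arm_compute::quantize_qsymm16.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// Hypothetical stand-in for quantize_qsymm16: round to nearest, saturate to int16.
int16_t quantize_qsymm16_example(float v, float scale)
{
    int32_t q = static_cast<int32_t>(std::lround(v / scale));
    q         = std::max<int32_t>(-32768, std::min<int32_t>(32767, q)); // saturate to the int16 range
    return static_cast<int16_t>(q);
}

int main()
{
    const float   scale1 = 0.02f, scale2 = 0.05f, scale_out = 0.1f; // assumed quantization scales
    const int16_t a = 1000, b = 400;                                // quantized inputs
    const float   afs = a * scale1;                                 // dequantize input 1 -> 20.0
    const float   bfs = b * scale2;                                 // dequantize input 2 -> 20.0
    std::cout << quantize_qsymm16_example(afs + bfs, scale_out) << '\n'; // requantized sum: 400
}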

◆ add_s16_u8_s16_neon()

void add_s16_u8_s16_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 100 of file integer.cpp.

101 {
102  // Create input windows
103  Window win = window;
104  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
105  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
106 
107  // Clear X Dimension on execution window as we handle manually
108  win.set(Window::DimX, Window::Dimension(0, 1, 1));
109  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
110  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
111 
112  Iterator input1(src0, input1_win);
113  Iterator input2(src1, input2_win);
114  Iterator output(dst, win);
115 
116  const int window_step_x = 8;
117  const auto window_start_x = static_cast<int>(window.x().start());
118  const auto window_end_x = static_cast<int>(window.x().end());
119 
120  execute_window_loop(win, [&](const Coordinates &)
121  {
122  const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
123  const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
124  const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
125 
126  if(policy == ConvertPolicy::WRAP)
127  {
128  // Compute S elements per iteration
129  int x = window_start_x;
130  for(; x <= (window_end_x - window_step_x); x += window_step_x)
131  {
132  const auto vin1 = wrapper::vloadq(input1_ptr + x);
133  const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
134  wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
135  }
136 
137  // Compute left-over elements
138  for(; x < window_end_x; ++x)
139  {
140  *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
141  }
142  }
143  else
144  {
145  // Compute S elements per iteration
146  int x = window_start_x;
147  for(; x <= (window_end_x - window_step_x); x += window_step_x)
148  {
149  const auto vin1 = wrapper::vloadq(input1_ptr + x);
150  const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
151  wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
152  }
153 
154  // Compute left-over elements
155  for(; x < window_end_x; ++x)
156  {
157  *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
158  }
159  }
160  },
161  input1, input2, output);
162 }
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
uint8_t add_sat(const uint8_t &a, const uint8_t &b)
Definition: add.h:33
uint8x8_t vqadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:73
uint8x8_t vload(const uint8_t *ptr)
Definition: load.h:39
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
uint16x8_t vmovl(const uint8x8_t &a)
Definition: movl.h:39

References arm_compute::wrapper::add_sat(), Window::broadcast_if_dimension_le_one(), Window::DimX, arm_compute::test::validation::dst, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), Window::set(), Window::Dimension::start(), ITensorInfo::tensor_shape(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vload(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmovl(), arm_compute::wrapper::vqadd(), arm_compute::wrapper::vstore(), arm_compute::WRAP, and Window::x().

Referenced by add_u8_s16_s16_neon().
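
The two branches above differ only in the ConvertPolicy: WRAP uses plain modular addition (wrapper::vadd), while SATURATE clamps to the int16 range (wrapper::vqadd / wrapper::add_sat). A scalar sketch of that difference, not library code; saturate_add_s16 is a hypothetical helper:

#include <algorithm>
#include <cstdint>
#include <iostream>

int16_t saturate_add_s16(int16_t a, int16_t b)
{
    const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
    return static_cast<int16_t>(std::max(-32768, std::min(32767, sum))); // clamp instead of wrapping
}

int main()
{
    const int16_t a = 32700;
    const uint8_t b = 200;
    // WRAP: two's-complement addition, typically wraps around on overflow.
    const int16_t wrapped = static_cast<int16_t>(a + static_cast<int16_t>(b));
    // SATURATE: result is pinned to the representable range.
    const int16_t saturated = saturate_add_s16(a, static_cast<int16_t>(b));
    std::cout << wrapped << " vs " << saturated << '\n'; // typically -32636 vs 32767
}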

◆ add_same_neon()

void arm_compute::cpu::add_same_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

SIMD vector tag type.

Definition at line 48 of file list.h.

49 {
50  /** SIMD vector tag type. */
51  using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
52 
53  // Create input windows
54  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
55  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
56 
57  // Clear X Dimension on execution window as we handle manually
58  Window win = window;
59  win.set(Window::DimX, Window::Dimension(0, 1, 1));
60 
61  constexpr int window_step_x = 16 / sizeof(ScalarType);
62  const auto window_start_x = static_cast<int>(window.x().start());
63  const auto window_end_x = static_cast<int>(window.x().end());
64  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
65 
66  if(is_broadcast_across_x)
67  {
68  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
69  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
70  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
71  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
72  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
73 
74  // Clear X Dimension on execution window as we handle manually
75  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
76 
77  Iterator broadcast_input(broadcast_tensor, broadcast_win);
78  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
79  Iterator output(dst, win);
80 
81  execute_window_loop(win, [&](const Coordinates &)
82  {
83  const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
84  const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
85 
86  const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
87  const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
88 
89  // Compute S elements per iteration
90  int x = window_start_x;
91  for(; x <= (window_end_x - window_step_x); x += window_step_x)
92  {
93  const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
94  const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
95  wrapper::vstore(output_ptr + x, res);
96  }
97 
98  // Compute left-over elements
99  for(; x < window_end_x; ++x)
100  {
101  const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
102  *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
103  }
104  },
105  broadcast_input, non_broadcast_input, output);
106  }
107  else
108  {
109  // Clear X Dimension on execution window as we handle manually
110  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
111  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
112 
113  Iterator input1(src0, input1_win);
114  Iterator input2(src1, input2_win);
115  Iterator output(dst, win);
116 
117  execute_window_loop(win, [&](const Coordinates &)
118  {
119  const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
120  const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
121  const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
122 
123  // Compute S elements per iteration
124  int x = window_start_x;
125  for(; x <= (window_end_x - window_step_x); x += window_step_x)
126  {
127  const auto val1 = wrapper::vloadq(input1_ptr + x);
128  const auto val2 = wrapper::vloadq(input2_ptr + x);
129  const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
130  wrapper::vstore(output_ptr + x, res);
131  }
132 
133  // Compute left-over elements
134  for(; x < window_end_x; ++x)
135  {
136  const auto val1 = *(input1_ptr + x);
137  const auto val2 = *(input2_ptr + x);
138  *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
139  }
140  },
141  input1, input2, output);
142  }
143 }
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
uint8_t add_sat(const uint8_t &a, const uint8_t &b)
Definition: add.h:33
uint8x8_t vqadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:73
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77

References arm_compute::wrapper::add_sat(), Window::broadcast_if_dimension_le_one(), Window::DimX, arm_compute::test::validation::dst, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), arm_compute::SATURATE, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vqadd(), arm_compute::wrapper::vstore(), Dimensions< T >::x(), and Window::x().
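
Whichever branch is taken, each row is processed as a vectorised main loop of window_step_x elements followed by a scalar tail for the leftovers. A plain C++ sketch of that structure, not library code; the inner loop stands in for one vloadq/vadd/vstore on a full register:

#include <vector>

void add_like_structure(const float *a, const float *b, float *dst, int len)
{
    constexpr int step = 4; // stands in for 16 / sizeof(ScalarType)
    int x = 0;
    for(; x <= len - step; x += step)
    {
        for(int i = 0; i < step; ++i) // one register's worth of work per main-loop iteration
        {
            dst[x + i] = a[x + i] + b[x + i];
        }
    }
    for(; x < len; ++x) // leftover elements that do not fill a register
    {
        dst[x] = a[x] + b[x];
    }
}

int main()
{
    std::vector<float> a(10, 1.f), b(10, 2.f), dst(10);
    add_like_structure(a.data(), b.data(), dst.data(), static_cast<int>(a.size()));
}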

◆ add_u8_s16_s16_neon()

void add_u8_s16_s16_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 164 of file integer.cpp.

165 {
166  // Simply swap the two input buffers:
167  add_s16_u8_s16_neon(src1, src0, dst, policy, window);
168 }
void add_s16_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
Definition: integer.cpp:100

References add_s16_u8_s16_neon(), and arm_compute::test::validation::dst.

◆ add_u8_u8_s16_neon()

void add_u8_u8_s16_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 35 of file integer.cpp.

36 {
37  // Create input windows
38  Window win = window;
39  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
40  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
41 
42  // Clear X Dimension on execution window as we handle manually
43  win.set(Window::DimX, Window::Dimension(0, 1, 1));
44  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
45  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  Iterator input1(src0, input1_win);
48  Iterator input2(src1, input2_win);
49  Iterator output(dst, win);
50 
51  const int window_step_x = 8;
52  const auto window_start_x = static_cast<int>(window.x().start());
53  const auto window_end_x = static_cast<int>(window.x().end());
54 
55  execute_window_loop(win, [&](const Coordinates &)
56  {
57  const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
58  const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
59  const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
60 
61  if(policy == ConvertPolicy::WRAP)
62  {
63  // Compute S elements per iteration
64  int x = window_start_x;
65  for(; x <= (window_end_x - window_step_x); x += window_step_x)
66  {
67  const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
68  const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
69  wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
70  }
71 
72  // Compute left-over elements
73  for(; x < window_end_x; ++x)
74  {
75  *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
76  }
77  }
78  else
79  {
80  // Compute S elements per iteration
81  int x = window_start_x;
82  for(; x <= (window_end_x - window_step_x); x += window_step_x)
83  {
84  const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
85  const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
86  wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
87  }
88 
89  // Compute left-over elements
90  for(; x < window_end_x; ++x)
91  {
92  *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
93  static_cast<int16_t>(*(input2_ptr + x)));
94  }
95  }
96  },
97  input1, input2, output);
98 }
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
uint8_t add_sat(const uint8_t &a, const uint8_t &b)
Definition: add.h:33
uint8x8_t vqadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:73
uint8x8_t vload(const uint8_t *ptr)
Definition: load.h:39
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
uint16x8_t vmovl(const uint8x8_t &a)
Definition: movl.h:39

References arm_compute::wrapper::add_sat(), Window::broadcast_if_dimension_le_one(), Window::DimX, arm_compute::test::validation::dst, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), Window::set(), Window::Dimension::start(), ITensorInfo::tensor_shape(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vload(), arm_compute::wrapper::vmovl(), arm_compute::wrapper::vqadd(), arm_compute::wrapper::vstore(), arm_compute::WRAP, and Window::x().
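
Both U8 operands are widened to S16 (wrapper::vmovl plus a reinterpret) before the addition, so the intermediate sum always fits the destination type. A scalar sketch of the widening, not library code:

#include <cstdint>
#include <iostream>

int main()
{
    const uint8_t a = 250, b = 240;
    const int16_t widened_sum = static_cast<int16_t>(a) + static_cast<int16_t>(b); // promote before adding
    std::cout << widened_sum << '\n'; // 490; even 255 + 255 = 510 still fits in int16_t
}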

◆ bilinear_neon_scale()

void arm_compute::cpu::bilinear_neon_scale ( const ITensor * src,
ITensor * dst,
const ITensor * offsets,
const ITensor * dx,
const ITensor * dy,
BorderMode  border_mode,
PixelValue  constant_border_value,
float  sampling_offset,
bool  align_corners,
const Window & window 
)

Definition at line 94 of file list.h.

97 {
98  // Compute the ratio between source height and destination height
99  const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
100 
101  Iterator out(dst, window);
102  const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
103  const int in_dim_w = src->info()->dimension(1);
104  const int in_dim_h = src->info()->dimension(2);
105  const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
106 
107  // Don't increment in Y and Z direction for the input tensor
108  // A pointer to the start of this plane is needed as base for the precomputed offsets
109  Window win_in(window);
110  win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
111  win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
112  Iterator in(src, win_in);
113 
114  if(border_mode == BorderMode::CONSTANT)
115  {
116 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
117  using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
118 #else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
119  using ConstType = T;
120 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
121  const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
122  execute_window_loop(window, [&](const Coordinates & id)
123  {
124  const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
125  const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
126  const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
127  const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
128  const T *in_ptr = reinterpret_cast<const T *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
129 
130  const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
131  const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
132  const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
133  const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
134 
135  *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
136  },
137  in, out);
138  }
139  else if(border_mode == BorderMode::REPLICATE)
140  {
141  execute_window_loop(window, [&](const Coordinates & id)
142  {
143  const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
144  const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
145  const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
146  const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
147 
148  auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
149  auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
150  auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
151  auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
152 
153  const auto a00 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
154  const auto a01 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
155  const auto a10 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
156  const auto a11 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
157 
158  *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
159  },
160  in, out);
161  }
162  else
163  {
164  ARM_COMPUTE_ERROR("Not implemented");
165  }
166 }
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:861
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
half_float::half half
16-bit floating point type
Definition: Types.h:46
decltype(strategy::transforms) typedef type
SimpleTensor< float > src
Definition: DFT.cpp:155
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
float calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners=false)
Returns resize ratio between input and output with consideration of aligned corners.
Definition: ScaleUtils.cpp:27
float delta_bilinear(float a00, float a01, float a10, float a11, float dx_val, float dy_val)
Computes bilinear interpolation using the top-left, top-right, bottom-left, bottom-right pixels and t...
Definition: ScaleHelpers.h:343

References ARM_COMPUTE_ERROR, arm_compute::scale_utils::calculate_resize_ratio(), arm_compute::CONSTANT, arm_compute::scale_helpers::delta_bilinear(), Window::DimY, Window::DimZ, arm_compute::test::validation::dst, arm_compute::execute_window_loop(), PixelValue::get(), offset(), Iterator::ptr(), ITensor::ptr_to_element(), arm_compute::REPLICATE, arm_compute::test::validation::src, and type.
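
The four neighbouring samples a00, a01, a10 and a11 are blended by scale_helpers::delta_bilinear using the fractional offsets dx_val and dy_val. A scalar sketch of conventional bilinear weighting, not library code, assuming delta_bilinear follows the usual formula:

#include <iostream>

float bilinear_example(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy); // top-left weight
    const float w01 = dx * (1.f - dy);         // top-right weight
    const float w10 = (1.f - dx) * dy;         // bottom-left weight
    const float w11 = dx * dy;                 // bottom-right weight
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}

int main()
{
    // Sampling exactly half-way between four pixels returns their average.
    std::cout << bilinear_example(0.f, 2.f, 4.f, 6.f, 0.5f, 0.5f) << '\n'; // 3
}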

◆ calculate_avg_scale()

float arm_compute::cpu::calculate_avg_scale ( bool  exclude_padding,
DataLayout  data_layout,
const Coordinates & id,
const int  pool_size_x,
const int  pool_size_y,
const int  upper_bound_w,
const int  upper_bound_h,
const int  pad_x,
const int  pad_y,
const int  stride_x,
const int  stride_y 
)
inline

Definition at line 162 of file quantized.h.

164 {
165  const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
166  const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
167 
168  int start_x = id[idx_width] * stride_x - pad_x;
169  int start_y = id[idx_height] * stride_y - pad_y;
170 
171  const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
172  const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
173  if(exclude_padding)
174  {
175  start_x = std::max(0, start_x);
176  start_y = std::max(0, start_y);
177  }
178  return 1.f / ((end_y - start_y) * (end_x - start_x));
179 }
const DataLayout data_layout
Definition: Im2Col.cpp:151
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
Definition: Helpers.inl:193

References arm_compute::test::validation::data_layout, arm_compute::get_data_layout_dimension_index(), arm_compute::HEIGHT, arm_compute::test::validation::idx_height, arm_compute::test::validation::idx_width, and arm_compute::WIDTH.

Referenced by poolingMxN_fp32_neon_nhwc(), and poolingMxN_q8_neon_nhwc().
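
The returned value is the reciprocal of the number of pool elements actually averaged; with exclude_padding the window start is clipped to valid coordinates first, which shrinks the divisor at the borders. A standalone worked example, not library code, with illustrative bounds:

#include <algorithm>
#include <iostream>

float avg_scale_example(bool exclude_padding, int start_x, int start_y,
                        int pool_x, int pool_y, int upper_w, int upper_h)
{
    const int end_x = std::min(start_x + pool_x, upper_w);
    const int end_y = std::min(start_y + pool_y, upper_h);
    if(exclude_padding)
    {
        start_x = std::max(0, start_x); // drop the padded rows/columns from the count
        start_y = std::max(0, start_y);
    }
    return 1.f / ((end_y - start_y) * (end_x - start_x));
}

int main()
{
    // 3x3 pool starting at (-1,-1) because of 1 pixel of padding, valid region bounded at 2.
    std::cout << avg_scale_example(false, -1, -1, 3, 3, 2, 2) << '\n'; // 1/9: padding counted
    std::cout << avg_scale_example(true,  -1, -1, 3, 3, 2, 2) << '\n'; // 1/4: only valid elements counted
}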

◆ common_neon_scale()

void arm_compute::cpu::common_neon_scale ( const ITensor * src,
ITensor * dst,
const ITensor * offsets,
const ITensor * dx,
const ITensor * dy,
InterpolationPolicy  policy,
BorderMode  border_mode,
PixelValue  constant_border_value,
float  sampling_offset,
bool  align_corners,
const Window & window 
)

Definition at line 169 of file list.h.

172 {
173  if(policy == InterpolationPolicy::BILINEAR)
174  {
175  bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
176  }
177  else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
178  {
179  nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window);
180  }
181 }
SimpleTensor< float > src
Definition: DFT.cpp:155

References arm_compute::BILINEAR, arm_compute::test::validation::dst, arm_compute::NEAREST_NEIGHBOR, and arm_compute::test::validation::src.

◆ elementwise_arithm_op() [1/3]

float32x4x4_t arm_compute::cpu::elementwise_arithm_op ( const float32x4x4_t &  a,
const float32x4x4_t &  b 
)
inline

Definition at line 125 of file elementwise_quantized_list.h.

126 {
127  using neon_vector_float = wrapper::traits::neon_vector<float, 4>;
128  float32x4x4_t out =
129  {
130  {
131  elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
132  elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
133  elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
134  elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
135  }
136  };
137  return out;
138 }
SimpleTensor< float > b
Definition: DFT.cpp:157

References arm_compute::test::validation::b.

◆ elementwise_arithm_op() [2/3]

VectorType::type arm_compute::cpu::elementwise_arithm_op ( const typename VectorType::type & a,
const typename VectorType::type & b 
)
inline

Definition at line 160 of file elementwise_list.h.

161 {
162  using vec_type = typename VectorType::type;
163  using scalar_type = typename VectorType::scalar_type;
164  using tag_type = typename VectorType::tag_type;
165 
166  vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
167 
168  switch(op)
169  {
170  case ArithmeticOperation::MAX:
171  res = wrapper::vmax(a, b);
172  break;
173  case ArithmeticOperation::MIN:
174  res = wrapper::vmin(a, b);
175  break;
176  case ArithmeticOperation::SQUARED_DIFF:
177  {
178  const vec_type tmp = wrapper::vsub(a, b);
179  res = wrapper::vmul(tmp, tmp);
180  break;
181  }
182  case ArithmeticOperation::PRELU:
183  {
184  const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
185  const vec_type tmp = wrapper::vmul(a, b);
186  const auto gt = wrapper::vcgt(a, zero);
187 
188  res = wrapper::vbsl(gt, a, tmp);
189  break;
190  }
191 
192  default:
193  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
194  }
195 
196  return res;
197 }
#define PRELU(x, y)
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
#define MAX(x, y)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Definition: sub.h:39
#define MIN(x, y)
decltype(strategy::transforms) typedef type
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
Definition: min.h:39
uint8x8_t vcgt(const uint8x8_t &a, const uint8x8_t &b)
Definition: cgt.h:39
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
Definition: mul.h:39
uint8x8_t vbsl(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Definition: bsl.h:39
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
Definition: max.h:39
#define SQUARED_DIFF(x, y)

References ARM_COMPUTE_ERROR, arm_compute::test::validation::b, arm_compute::MAX, arm_compute::MIN, arm_compute::PRELU, arm_compute::SQUARED_DIFF, type, arm_compute::wrapper::vbsl(), arm_compute::wrapper::vcgt(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vmax(), arm_compute::wrapper::vmin(), arm_compute::wrapper::vmul(), and arm_compute::wrapper::vsub().
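
The PRELU branch is a compare-and-select: lanes where a > 0 keep a, the rest take a * b (vcgt builds the mask, vbsl selects). A scalar sketch of that semantics, not library code:

#include <iostream>

float prelu_example(float a, float b)
{
    const bool  gt_zero = a > 0.f; // wrapper::vcgt(a, zero)
    const float scaled  = a * b;   // wrapper::vmul(a, b)
    return gt_zero ? a : scaled;   // wrapper::vbsl(gt, a, tmp)
}

int main()
{
    std::cout << prelu_example(2.f, 0.25f) << ' ' << prelu_example(-2.f, 0.25f) << '\n'; // 2 -0.5
}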

◆ elementwise_arithm_op() [3/3]

void arm_compute::cpu::elementwise_arithm_op ( const ITensor * in1,
const ITensor * in2,
ITensor * out,
const Window & window 
)

Definition at line 269 of file elementwise_list.h.

270 {
271  using scalar_type = typename VectorType::scalar_type;
272 
273  elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
274  &elementwise_arithm_op_scalar<op, scalar_type>,
275  &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
276  &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
277 }

◆ elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< float, 4 > >()

float32x4_t arm_compute::cpu::elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< float, 4 > > ( const float32x4_t &  a,
const float32x4_t &  b 
)
inline

Definition at line 206 of file elementwise_list.h.

207 {
208  return wrapper::vdiv(a, b);
209 }
float32x2_t vdiv(const float32x2_t &a, const float32x2_t &b)
Definition: div.h:58
SimpleTensor< float > b
Definition: DFT.cpp:157

References arm_compute::test::validation::b, and arm_compute::wrapper::vdiv().

◆ elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< int32_t, 4 > >()

int32x4_t arm_compute::cpu::elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< int32_t, 4 > > ( const int32x4_t &  a,
const int32x4_t &  b 
)
inline

Definition at line 200 of file elementwise_list.h.

201 {
202  return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
203 }
float32x2_t vdiv(const float32x2_t &a, const float32x2_t &b)
Definition: div.h:58
SimpleTensor< float > b
Definition: DFT.cpp:157
float32x4_t vfloorq_f32(float32x4_t val)
Calculate floor of a vector.

References arm_compute::test::validation::b, arm_compute::wrapper::vdiv(), and arm_compute::vfloorq_f32().

◆ elementwise_arithm_op< ArithmeticOperation::POWER, typename wrapper::traits::neon_vector< float, 4 > >()

float32x4_t arm_compute::cpu::elementwise_arithm_op< ArithmeticOperation::POWER, typename wrapper::traits::neon_vector< float, 4 > > ( const float32x4_t &  a,
const float32x4_t &  b 
)
inline

Definition at line 212 of file elementwise_list.h.

213 {
214  return wrapper::vpow(a, b);
215 }
SimpleTensor< float > b
Definition: DFT.cpp:157
float32x4_t vpow(const float32x4_t &a, const float32x4_t &b)
Definition: pow.h:40

References arm_compute::test::validation::b, and arm_compute::wrapper::vpow().

◆ elementwise_arithm_op_broadcast()

VectorType::type arm_compute::cpu::elementwise_arithm_op_broadcast ( const typename VectorType::type & a,
const ScalarType &  broadcast_value,
const bool  reorder 
)
inline

Definition at line 232 of file elementwise_list.h.

233 {
234  using tag_type = typename VectorType::tag_type;
235  using vec_type = typename VectorType::type;
236 
237  vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{});
238  return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
239 }
decltype(strategy::transforms) typedef type
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
Sets the macro arm_any if compiling for Aarch32 or Aarch64.

References type, and arm_compute::wrapper::vdup_n().

◆ elementwise_arithm_op_broadcast_loop()

int arm_compute::cpu::elementwise_arithm_op_broadcast_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const ScalarType *  non_broadcast_input_ptr,
const ScalarType &  broadcast_value,
ScalarType *  output_ptr,
const bool  reorder 
)
inline

Definition at line 256 of file elementwise_list.h.

258 {
259  int x = window_start_x;
260  for(; x <= (window_end_x - window_step_x); x += window_step_x)
261  {
262  const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
263  wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
264  }
265  return x;
266 }
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
Sets the macro arm_any if compiling for Aarch32 or Aarch64.

References arm_compute::wrapper::vloadq(), and arm_compute::wrapper::vstore().

◆ elementwise_arithm_op_loop()

int arm_compute::cpu::elementwise_arithm_op_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const ScalarType *  input1_ptr,
const ScalarType *  input2_ptr,
ScalarType *  output_ptr 
)
inline

Definition at line 242 of file elementwise_list.h.

244 {
245  int x = window_start_x;
246  for(; x <= (window_end_x - window_step_x); x += window_step_x)
247  {
248  const auto a = wrapper::vloadq(input1_ptr + x);
249  const auto b = wrapper::vloadq(input2_ptr + x);
250  wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b));
251  }
252  return x;
253 }
SimpleTensor< float > b
Definition: DFT.cpp:157
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39

References arm_compute::test::validation::b, arm_compute::wrapper::vloadq(), and arm_compute::wrapper::vstore().

◆ elementwise_arithm_op_quantized()

void arm_compute::cpu::elementwise_arithm_op_quantized ( const ITensor * in1,
const ITensor * in2,
ITensor * out,
const Window & window 
)

Definition at line 622 of file elementwise_quantized_list.h.

623 {
624  elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>,
625  &elementwise_arithm_op_quantized_broadcast_loop<op>,
626  &elementwise_arithm_op_quantized_loop<op>);
627 }
void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, uint8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))

References elementwise_op_quantized().

◆ elementwise_arithm_op_quantized_broadcast_loop()

int arm_compute::cpu::elementwise_arithm_op_quantized_broadcast_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const uint8_t *  non_broadcast_input_ptr,
float32x4x4_t  broadcast_vector,
uint8_t *  output_ptr,
int32x4_t  voffset_non_broadcast,
float32x4_t  vscale_non_broadcast,
float32x4_t  voffseto,
float32x4_t  invvscaleo,
bool  reorder 
)
inline

Definition at line 199 of file elementwise_quantized_list.h.

203 {
204  int x = window_start_x;
205  for(; x <= (window_end_x - window_step_x); x += window_step_x)
206  {
207  const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
208  const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
209  store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
210  }
211  return x;
212 }
float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
Sets the macro arm_any if compiling for Aarch32 or Aarch64.

References load_quantized(), and store_quantized().

◆ elementwise_arithm_op_quantized_loop()

int arm_compute::cpu::elementwise_arithm_op_quantized_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const uint8_t *  input1_ptr,
const uint8_t *  input2_ptr,
uint8_t *  output_ptr,
int32x4_t  voffset1,
int32x4_t  voffset2,
float32x4_t  vscale1,
float32x4_t  vscale2,
float32x4_t  voffseto,
float32x4_t  invvscaleo 
)
inline

Definition at line 163 of file elementwise_quantized_list.h.

167 {
168  int x = window_start_x;
169  for(; x <= (window_end_x - window_step_x); x += window_step_x)
170  {
171  // Get inputs and compute output
172  const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
173  const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
174  const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
175  store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
176  }
177  return x;
178 }
float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)

References load_quantized(), and store_quantized().

◆ elementwise_arithm_op_quantized_scalar()

uint8_t arm_compute::cpu::elementwise_arithm_op_quantized_scalar ( const float &  a,
const float &  b,
UniformQuantizationInfo  qinfo 
)
inline

Definition at line 113 of file elementwise_quantized_list.h.

114 {
115  return quantize_qasymm8(elementwise_arithm_op_scalar<op>(a, b), qinfo);
116 }
SimpleTensor< float > b
Definition: DFT.cpp:157
uchar quantize_qasymm8(float input, float offset, float scale)
Quantize a floating-point scalar value to 8-bit asymmetric.
Definition: helpers_asymm.h:47
const QuantizationInfo qinfo
Definition: Im2Col.cpp:155

References arm_compute::test::validation::b, arm_compute::test::validation::qinfo, and arm_compute::quantize_qasymm8().
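
The quantized scalar path is dequantize, operate, requantize: the caller has already converted both inputs to float, the arithmetic op runs in float, and the result is quantized back with the output UniformQuantizationInfo. A scalar sketch of the full QASYMM8 round trip, not library code, with assumed quantization parameters:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

float dequantize_example(uint8_t q, int32_t offset, float scale)
{
    return (static_cast<int32_t>(q) - offset) * scale;
}

uint8_t requantize_example(float v, int32_t offset, float scale)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
    return static_cast<uint8_t>(std::max(0, std::min(255, q))); // saturate to uint8
}

int main()
{
    const int32_t offset = 128;
    const float   scale  = 0.05f;                             // assumed qinfo
    const float   a = dequantize_example(138, offset, scale); // 0.5
    const float   b = dequantize_example(148, offset, scale); // 1.0
    std::cout << +requantize_example(a + b, offset, scale) << '\n'; // 158
}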

◆ elementwise_arithm_op_quantized_signed()

void arm_compute::cpu::elementwise_arithm_op_quantized_signed ( const ITensor * in1,
const ITensor * in2,
ITensor * out,
const Window & window 
)

Definition at line 629 of file elementwise_quantized_list.h.

630 {
631  elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar<op>,
632  &elementwise_arithm_op_quantized_signed_broadcast_loop<op>,
633  &elementwise_arithm_op_quantized_singed_loop<op>);
634 }
void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))

References elementwise_op_quantized_signed().

◆ elementwise_arithm_op_quantized_signed_broadcast_loop()

int arm_compute::cpu::elementwise_arithm_op_quantized_signed_broadcast_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const int8_t *  non_broadcast_input_ptr,
float32x4x4_t  broadcast_vector,
int8_t *  output_ptr,
int32x4_t  voffset_non_broadcast,
float32x4_t  vscale_non_broadcast,
float32x4_t  voffseto,
float32x4_t  invvscaleo,
bool  reorder 
)
inline

Definition at line 214 of file elementwise_quantized_list.h.

218 {
219  int x = window_start_x;
220  for(; x <= (window_end_x - window_step_x); x += window_step_x)
221  {
222  const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
223  const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
224  store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
225  }
226  return x;
227 }
void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
Sets the macro arm_any if compiling for Aarch32 or Aarch64.

References load_quantized_signed(), and store_quantized_signed().

◆ elementwise_arithm_op_quantized_signed_scalar()

int8_t arm_compute::cpu::elementwise_arithm_op_quantized_signed_scalar ( const float &  a,
const float &  b,
UniformQuantizationInfo  qinfo 
)
inline

Definition at line 119 of file elementwise_quantized_list.h.

120 {
121  return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo);
122 }
SimpleTensor< float > b
Definition: DFT.cpp:157
int8_t quantize_qasymm8_signed(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given a signed 8-bit asymmetric quantization scheme.
const QuantizationInfo qinfo
Definition: Im2Col.cpp:155

References arm_compute::test::validation::b, arm_compute::test::validation::qinfo, and arm_compute::quantize_qasymm8_signed().

◆ elementwise_arithm_op_quantized_singed_loop()

int arm_compute::cpu::elementwise_arithm_op_quantized_singed_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const int8_t *  input1_ptr,
const int8_t *  input2_ptr,
int8_t *  output_ptr,
int32x4_t  voffset1,
int32x4_t  voffset2,
float32x4_t  vscale1,
float32x4_t  vscale2,
float32x4_t  voffseto,
float32x4_t  invvscaleo 
)
inline

Definition at line 181 of file elementwise_quantized_list.h.

185 {
186  int x = window_start_x;
187  for(; x <= (window_end_x - window_step_x); x += window_step_x)
188  {
189  // Get inputs and compute output
190  const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
191  const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
192  const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
193  store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
194  }
195  return x;
196 }
void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)

References load_quantized_signed(), and store_quantized_signed().

◆ elementwise_arithm_op_scalar()

ScalarType arm_compute::cpu::elementwise_arithm_op_scalar ( const ScalarType &  a,
const ScalarType &  b 
)
inline

Definition at line 113 of file elementwise_list.h.

114 {
115  auto res = ScalarType(0);
116 
117  switch(op)
118  {
119  case ArithmeticOperation::MAX:
120  res = std::max(a, b);
121  break;
122  case ArithmeticOperation::MIN:
123  res = std::min(a, b);
124  break;
125  case ArithmeticOperation::SQUARED_DIFF:
126  {
127  res = (a - b) * (a - b);
128  break;
129  }
130  case ArithmeticOperation::PRELU:
131  {
132  res = (a > 0 ? a : a * b);
133  break;
134  }
135  case ArithmeticOperation::DIV:
136  {
137  res = a / b;
138  if(std::is_integral<ScalarType>::value)
139  {
140  res = (b == 0) ? 0 : res;
141  if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
142  {
143  --res;
144  }
145  }
146  break;
147  }
148  case ArithmeticOperation::POWER:
149  {
150  res = std::pow(a, b);
151  break;
152  }
153  default:
154  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
155  }
156  return res;
157 }
#define PRELU(x, y)
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
#define DIV(x, y)
#define MAX(x, y)
#define MIN(x, y)
#define SQUARED_DIFF(x, y)

References ARM_COMPUTE_ERROR, arm_compute::test::validation::b, arm_compute::DIV, arm_compute::MAX, arm_compute::MIN, arm_compute::POWER, arm_compute::PRELU, and arm_compute::SQUARED_DIFF.
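
For integral types the DIV case above turns C++'s truncating division into floor division (the result is decremented when the signs differ and a remainder is left) and defines division by zero as 0. A scalar sketch of exactly that adjustment, not library code:

#include <cstdint>
#include <iostream>

int32_t floor_div_example(int32_t a, int32_t b)
{
    if(b == 0)
    {
        return 0;                            // integral division by zero yields 0 here
    }
    int32_t res = a / b;                     // truncates toward zero
    if((a % b != 0) && ((a < 0) != (b < 0))) // remainder left and signs differ
    {
        --res;                               // step down to the floor
    }
    return res;
}

int main()
{
    std::cout << floor_div_example(7, 2) << ' '
              << floor_div_example(-7, 2) << ' '
              << floor_div_example(-7, 0) << '\n'; // 3 -4 0
}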

◆ elementwise_comp_op() [1/2]

uint32x4x4_t arm_compute::cpu::elementwise_comp_op ( const float32x4x4_t &  a,
const float32x4x4_t &  b 
)
inline

Definition at line 148 of file elementwise_quantized_list.h.

149 {
150  uint32x4x4_t out =
151  {
152  {
153  elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
154  elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
155  elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
156  elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])
157  }
158  };
159  return out;
160 }
SimpleTensor< float > b
Definition: DFT.cpp:157

References arm_compute::test::validation::b.

◆ elementwise_comp_op() [2/2]

OutputVectorType arm_compute::cpu::elementwise_comp_op ( const InputVectorType &  a,
const InputVectorType &  b 
)
inline

Definition at line 311 of file elementwise_list.h.

312 {
313  OutputVectorType res = { 0, 0, 0, 0 };
314 
315  switch(op)
316  {
317  case ComparisonOperation::Equal:
318  res = wrapper::vceq(a, b);
319  break;
320  case ComparisonOperation::NotEqual:
321  res = wrapper::vnot(wrapper::vceq(a, b));
322  break;
323  case ComparisonOperation::Greater:
324  res = wrapper::vcgt(a, b);
325  break;
326  case ComparisonOperation::GreaterEqual:
327  res = wrapper::vcge(a, b);
328  break;
329  case ComparisonOperation::Less:
330  res = wrapper::vcgt(b, a);
331  break;
332  case ComparisonOperation::LessEqual:
333  res = wrapper::vcge(b, a);
334  break;
335  default:
336  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
337  }
338 
339  return res;
340 }
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
uint8x8_t vnot(const uint8x8_t &a)
Definition: not.h:39
uint8x8_t vcgt(const uint8x8_t &a, const uint8x8_t &b)
Definition: cgt.h:39
uint8x8_t vcge(const uint8x8_t &a, const uint8x8_t &b)
Definition: cge.h:39
uint8x8_t vceq(const uint8x8_t &a, const uint8x8_t &b)
Definition: ceq.h:39

References ARM_COMPUTE_ERROR, arm_compute::test::validation::b, arm_compute::Equal, arm_compute::Greater, arm_compute::GreaterEqual, arm_compute::Less, arm_compute::LessEqual, arm_compute::NotEqual, arm_compute::wrapper::vceq(), arm_compute::wrapper::vcge(), arm_compute::wrapper::vcgt(), and arm_compute::wrapper::vnot().
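
Each output lane is a full-width mask: all ones when the comparison holds, all zeros otherwise. Less and LessEqual reuse the Greater/GreaterEqual intrinsics with swapped operands, and NotEqual is the bitwise complement of Equal. A scalar sketch of that mask convention, not library code:

#include <cstdint>
#include <iostream>

uint32_t cmp_mask_example(bool holds)
{
    return holds ? 0xFFFFFFFFu : 0u; // one 32-bit lane of the comparison result
}

int main()
{
    const float a = 1.f, b = 2.f;
    const uint32_t equal     = cmp_mask_example(a == b);
    const uint32_t not_equal = ~equal;                   // NotEqual = vnot(vceq(a, b))
    const uint32_t less      = cmp_mask_example(b > a);  // Less = Greater with swapped operands
    std::cout << std::hex << equal << ' ' << not_equal << ' ' << less << '\n'; // 0 ffffffff ffffffff
}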

◆ elementwise_comp_op_16()

void arm_compute::cpu::elementwise_comp_op_16 ( const ITensor * in1,
const ITensor * in2,
ITensor * out,
const Window & window 
)

Definition at line 467 of file elementwise_list.h.

468 {
469  elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
470  &elementwise_comp_op_scalar<op, InputScalarType>,
471  &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
472  &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
473 }

◆ elementwise_comp_op_16_loop()

int arm_compute::cpu::elementwise_comp_op_16_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  input1_ptr,
const InputScalarType *  input2_ptr,
uint8_t *  output_ptr 
)
inline

Definition at line 414 of file elementwise_list.h.

416 {
417  int x = window_start_x;
418  for(; x <= (window_end_x - window_step_x); x += window_step_x)
419  {
420  const auto a = wrapper::vloadq(input1_ptr + x);
421  const auto b = wrapper::vloadq(input2_ptr + x);
422  const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
423  wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
424  }
425  return x;
426 }
uint32x2_t vmovn(const uint64x2_t &a)
Definition: movn.h:39
SimpleTensor< float > b
Definition: DFT.cpp:157
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39

References arm_compute::test::validation::b, arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmovn(), and arm_compute::wrapper::vstore().

◆ elementwise_comp_op_32()

void arm_compute::cpu::elementwise_comp_op_32 ( const ITensor * in1,
const ITensor * in2,
ITensor * out,
const Window & window 
)

Definition at line 476 of file elementwise_list.h.

477 {
478  elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
479  &elementwise_comp_op_scalar<op, InputScalarType>,
480  &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
481  &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
482 }

◆ elementwise_comp_op_32_loop()

int arm_compute::cpu::elementwise_comp_op_32_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  input1_ptr,
const InputScalarType *  input2_ptr,
uint8_t *  output_ptr 
)
inline

Definition at line 429 of file elementwise_list.h.

431 {
432  int x = window_start_x;
433  for(; x <= (window_end_x - window_step_x); x += window_step_x)
434  {
435  auto a = wrapper::vloadq(input1_ptr + x);
436  auto b = wrapper::vloadq(input2_ptr + x);
437  const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
438  a = wrapper::vloadq(input1_ptr + x + 4);
439  b = wrapper::vloadq(input2_ptr + x + 4);
440  const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
441  wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
442  }
443  if(x <= window_end_x - 4)
444  {
445  const auto a = wrapper::vloadq(input1_ptr + x);
446  const auto b = wrapper::vloadq(input2_ptr + x);
447  const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
448  for(int i = 0; i < 4; i++)
449  {
450  *(output_ptr + x + i) = wrapper::vgetlane(res, i);
451  }
452  x = +4;
453  }
454  return x;
455 }
uint32x2_t vmovn(const uint64x2_t &a)
Definition: movn.h:39
SimpleTensor< float > b
Definition: DFT.cpp:157
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
Definition: getlane.h:91
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
Definition: combine.h:39
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39

References arm_compute::test::validation::b, arm_compute::wrapper::vcombine(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmovn(), and arm_compute::wrapper::vstore().
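
Because the inputs are 32-bit, each iteration produces two uint32x4_t masks that must be narrowed twice (32 -> 16 -> 8 bits) before eight uint8_t results can be stored. A scalar sketch of that narrowing chain for a single lane, not library code:

#include <cstdint>
#include <iostream>

int main()
{
    const uint32_t mask32 = 0xFFFFFFFFu;                   // one lane of the 32-bit comparison mask
    const uint16_t mask16 = static_cast<uint16_t>(mask32); // first vmovn: keep the low 16 bits
    const uint8_t  mask8  = static_cast<uint8_t>(mask16);  // second vmovn: keep the low 8 bits
    std::cout << std::hex << +mask8 << '\n';               // ff, i.e. "true" for this lane
}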

◆ elementwise_comp_op_8()

void arm_compute::cpu::elementwise_comp_op_8 ( const ITensor * in1,
const ITensor * in2,
ITensor * out,
const Window & window 
)

Definition at line 458 of file elementwise_list.h.

459 {
460  elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
461  &elementwise_comp_op_scalar<op, InputScalarType>,
462  &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
463  &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
464 }

◆ elementwise_comp_op_8_loop()

int arm_compute::cpu::elementwise_comp_op_8_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  input1_ptr,
const InputScalarType *  input2_ptr,
uint8_t *  output_ptr 
)
inline

Definition at line 399 of file elementwise_list.h.

401 {
402  int x = window_start_x;
403  for(; x <= (window_end_x - window_step_x); x += window_step_x)
404  {
405  const auto a = wrapper::vloadq(input1_ptr + x);
406  const auto b = wrapper::vloadq(input2_ptr + x);
407  const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b);
408  wrapper::vstore(output_ptr + x, res);
409  }
410  return x;
411 }
SimpleTensor< float > b
Definition: DFT.cpp:157
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39

References arm_compute::test::validation::b, arm_compute::wrapper::vloadq(), and arm_compute::wrapper::vstore().

◆ elementwise_comp_op_broadcast()

OutputVectorType arm_compute::cpu::elementwise_comp_op_broadcast ( const InputVectorType &  a,
const InputScalarType &  broadcast_value,
const bool  reorder 
)
inline

Definition at line 343 of file elementwise_list.h.

344 {
345  InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
346  return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
347 }
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
Sets the macro arm_any if compiling for Aarch32 or Aarch64.

References arm_compute::wrapper::vdup_n().

◆ elementwise_comp_op_broadcast_16_loop()

int arm_compute::cpu::elementwise_comp_op_broadcast_16_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  non_broadcast_input_ptr,
const InputScalarType &  broadcast_value,
uint8_t *  output_ptr,
const bool  reorder 
)
inline

Definition at line 363 of file elementwise_list.h.

365 {
366  int x = window_start_x;
367  for(; x <= (window_end_x - window_step_x); x += window_step_x)
368  {
369  const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
370  wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
371  }
372  return x;
373 }
uint32x2_t vmovn(const uint64x2_t &a)
Definition: movn.h:39
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
Sets the macro arm_any if compiling for Aarch32 or Aarch64.

References arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmovn(), and arm_compute::wrapper::vstore().

◆ elementwise_comp_op_broadcast_32_loop()

int arm_compute::cpu::elementwise_comp_op_broadcast_32_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  non_broadcast_input_ptr,
const InputScalarType &  broadcast_value,
uint8_t *  output_ptr,
const bool  reorder 
)
inline

Definition at line 376 of file elementwise_list.h.

378 {
379  int x = window_start_x;
380  for(; x <= (window_end_x - window_step_x); x += window_step_x)
381  {
382  const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
383  const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
384  wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
385  }
386  if(x <= window_end_x - 4)
387  {
388  const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
389  for(int i = 0; i < 4; i++)
390  {
391  *(output_ptr + x + i) = wrapper::vgetlane(a, i);
392  }
393  x = +4;
394  }
395  return x;
396 }
uint32x2_t vmovn(const uint64x2_t &a)
Definition: movn.h:39
SimpleTensor< float > b
Definition: DFT.cpp:157
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
Definition: getlane.h:91
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
Definition: combine.h:39
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
Sets the macro arm_any if compiling for Aarch32 or Aarch64.

References arm_compute::test::validation::b, arm_compute::wrapper::vcombine(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmovn(), and arm_compute::wrapper::vstore().

◆ elementwise_comp_op_broadcast_8_loop()

int arm_compute::cpu::elementwise_comp_op_broadcast_8_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  non_broadcast_input_ptr,
const InputScalarType &  broadcast_value,
uint8_t *  output_ptr,
const bool  reorder 
)
inline

Definition at line 350 of file elementwise_list.h.

352 {
353  int x = window_start_x;
354  for(; x <= (window_end_x - window_step_x); x += window_step_x)
355  {
356  const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
357  wrapper::vstore(output_ptr + x, a);
358  }
359  return x;
360 }
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
Sets the macro arm_any if compiling for Aarch32 or Aarch64.

References arm_compute::wrapper::vloadq(), and arm_compute::wrapper::vstore().

◆ elementwise_comp_op_quantized()

void arm_compute::cpu::elementwise_comp_op_quantized ( const ITensor * in1,
const ITensor * in2,
ITensor * out,
const Window & window 
)

Definition at line 637 of file elementwise_quantized_list.h.

638 {
639  elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
640  &elementwise_comp_op_quantized_broadcast_loop<op>,
641  &elementwise_comp_op_quantized_loop<op>);
642 }
void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, uint8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))

References elementwise_op_quantized().

◆ elementwise_comp_op_quantized_broadcast_loop()

int arm_compute::cpu::elementwise_comp_op_quantized_broadcast_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const uint8_t *  non_broadcast_input_ptr,
float32x4x4_t  broadcast_vector,
uint8_t *  output_ptr,
int32x4_t  voffset_non_broadcast,
float32x4_t  vscale_non_broadcast,
float32x4_t  voffseto,
float32x4_t  invvscaleo,
bool  reorder 
)
inline

Definition at line 266 of file elementwise_quantized_list.h.

270 {
271  ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
272  int x = window_start_x;
273  for(; x <= (window_end_x - window_step_x); x += window_step_x)
274  {
275  const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
276  const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
277  store_quantized(output_ptr + x, rf);
278  }
279  return x;
280 }

References ARM_COMPUTE_UNUSED, load_quantized(), and store_quantized().

◆ elementwise_comp_op_quantized_loop()

int arm_compute::cpu::elementwise_comp_op_quantized_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const uint8_t *  input1_ptr,
const uint8_t *  input2_ptr,
uint8_t *  output_ptr,
int32x4_t  voffset1,
int32x4_t  voffset2,
float32x4_t  vscale1,
float32x4_t  vscale2,
float32x4_t  voffseto,
float32x4_t  invvscaleo 
)
inline

Definition at line 230 of file elementwise_quantized_list.h.

234 {
235  ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
236  int x = window_start_x;
237  for(; x <= (window_end_x - window_step_x); x += window_step_x)
238  {
239  const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
240  const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
241  const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
242  store_quantized(output_ptr + x, rf);
243  }
244  return x;
245 }

References ARM_COMPUTE_UNUSED, load_quantized(), and store_quantized().

◆ elementwise_comp_op_quantized_scalar()

uint8_t arm_compute::cpu::elementwise_comp_op_quantized_scalar ( const float &  a,
const float &  b,
UniformQuantizationInfo  qinfo 
)
inline

Definition at line 141 of file elementwise_quantized_list.h.

142 {
143  ARM_COMPUTE_UNUSED(qinfo);
144  return elementwise_comp_op_scalar<op>(a, b);
145 }

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, and arm_compute::test::validation::qinfo.

◆ elementwise_comp_op_quantized_signed()

void arm_compute::cpu::elementwise_comp_op_quantized_signed ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window 
)

Definition at line 645 of file elementwise_quantized_list.h.

646 {
647  elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
648  &elementwise_comp_op_quantized_signed_broadcast_loop<op>,
649  &elementwise_comp_op_quantized_signed_loop<op>);
650 }

References elementwise_comp_quantized_signed().

◆ elementwise_comp_op_quantized_signed_broadcast_loop()

int arm_compute::cpu::elementwise_comp_op_quantized_signed_broadcast_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const int8_t *  non_broadcast_input_ptr,
float32x4x4_t  broadcast_vector,
uint8_t *  output_ptr,
int32x4_t  voffset_non_broadcast,
float32x4_t  vscale_non_broadcast,
float32x4_t  voffseto,
float32x4_t  invvscaleo,
bool  reorder 
)
inline

Definition at line 283 of file elementwise_quantized_list.h.

287 {
288  ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
289  int x = window_start_x;
290  for(; x <= (window_end_x - window_step_x); x += window_step_x)
291  {
292  const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
293  const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
294  store_quantized(output_ptr + x, rf);
295  }
296  return x;
297 }

References ARM_COMPUTE_UNUSED, load_quantized_signed(), and store_quantized().

◆ elementwise_comp_op_quantized_signed_loop()

int arm_compute::cpu::elementwise_comp_op_quantized_signed_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const int8_t *  input1_ptr,
const int8_t *  input2_ptr,
uint8_t *  output_ptr,
int32x4_t  voffset1,
int32x4_t  voffset2,
float32x4_t  vscale1,
float32x4_t  vscale2,
float32x4_t  voffseto,
float32x4_t  invvscaleo 
)
inline

Definition at line 248 of file elementwise_quantized_list.h.

252 {
253  ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
254  int x = window_start_x;
255  for(; x <= (window_end_x - window_step_x); x += window_step_x)
256  {
257  const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
258  const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
259  const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
260  store_quantized(output_ptr + x, rf);
261  }
262  return x;
263 }

References ARM_COMPUTE_UNUSED, load_quantized_signed(), and store_quantized().

◆ elementwise_comp_op_scalar()

uint8_t arm_compute::cpu::elementwise_comp_op_scalar ( const InputScalarType &  a,
const InputScalarType &  b 
)
inline

Definition at line 280 of file elementwise_list.h.

281 {
282  bool res = false;
283 
284  switch(op)
285  {
286  case ComparisonOperation::Equal:
287  res = (a == b);
288  break;
289  case ComparisonOperation::NotEqual:
290  res = (a != b);
291  break;
292  case ComparisonOperation::Greater:
293  res = (a > b);
294  break;
295  case ComparisonOperation::GreaterEqual:
296  res = (a >= b);
297  break;
298  case ComparisonOperation::Less:
299  res = (a < b);
300  break;
301  case ComparisonOperation::LessEqual:
302  res = (a <= b);
303  break;
304  default:
305  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
306  }
307  return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
308 }

References ARM_COMPUTE_ERROR, arm_compute::test::validation::b, arm_compute::Equal, arm_compute::Greater, arm_compute::GreaterEqual, arm_compute::Less, arm_compute::LessEqual, and arm_compute::NotEqual.
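
The return value follows the mask convention used by the vectorised comparisons: a true result is an all-ones byte (0xFF), a false result is 0x00. A small standalone sketch of that convention, for illustration only:

#include <cstdint>
#include <cstdio>

// True comparisons produce 0xFF, false ones 0x00 (e.g. ComparisonOperation::Greater).
static uint8_t comp_mask(float a, float b)
{
    const bool res = a > b;
    return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
}

int main()
{
    std::printf("%u %u\n", static_cast<unsigned>(comp_mask(2.f, 1.f)), static_cast<unsigned>(comp_mask(1.f, 2.f))); // 255 0
    return 0;
}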

◆ elementwise_comp_quantized_signed()

void arm_compute::cpu::elementwise_comp_quantized_signed ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window,
uint8_t(*)(const float &, const float &, UniformQuantizationInfo scalar_func,
int(*)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool)  broadcast_func,
int(*)(int, int, int, const int8_t *, const int8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)  neon_func 
)

Definition at line 407 of file elementwise_quantized_list.h.

414 {
415  // Create input windows
416  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
417  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
418 
419  // Clear X Dimension on execution window as we handle manually
420  Window win = window;
421  win.set(Window::DimX, Window::Dimension(0, 1, 1));
422 
423  const int window_step_x = 16;
424  const auto window_start_x = static_cast<int>(window.x().start());
425  const auto window_end_x = static_cast<int>(window.x().end());
426  const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
427 
428  const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
429 
430  const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
431  const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
432 
433  if(is_broadcast_across_x)
434  {
435  // Select the broadcast input on the X axis
436  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
437  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
438  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
439  const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
440  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
441 
442  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
443  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
444 
445  const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
446  const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
447 
448  // Clear X Dimension on execution window as we handle manually
449  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
450 
451  Iterator broadcast_input(broadcast_tensor, broadcast_win);
452  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
453  Iterator output(out, win);
454 
455  execute_window_loop(win, [&](const Coordinates &)
456  {
457  const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
458  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
459 
460  const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
461  const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
462 
463  int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
464  voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
465  for(; x < window_end_x; ++x)
466  {
467  const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
468  const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
469  *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
470  }
471  },
472  broadcast_input, non_broadcast_input, output);
473  }
474  else
475  {
476  const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
477  const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
478 
479  // Input1 quantization info
480  const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
481  const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
482 
483  // Input2 quantization info
484  const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
485  const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
486 
487  // Clear X Dimension on execution window as we handle manually
488  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
489  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
490 
491  Iterator input1(in1, input1_win);
492  Iterator input2(in2, input2_win);
493  Iterator output(out, win);
494 
495  execute_window_loop(win, [&](const Coordinates &)
496  {
497  const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
498  const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
499  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
500 
501  int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
502  vscale1, vscale2, voffseto, invvscaleo);
503  for(; x < window_end_x; ++x)
504  {
505  const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
506  const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
507  *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
508  }
509  },
510  input1, input2, output);
511  }
512 }

References Window::broadcast_if_dimension_le_one(), arm_compute::dequantize_qasymm8_signed(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), arm_compute::vdequantize(), Dimensions< T >::x(), and Window::x().

Referenced by elementwise_comp_op_quantized_signed().
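
For the leftover elements, both operands are dequantized with the usual asymmetric model, real = (q - offset) * scale, and the comparison runs in float before a 0/255 byte mask is stored. A standalone sketch of that per-element fallback, not the library implementation:

#include <cstdint>
#include <cstdio>

struct UQInfo { float scale; int32_t offset; }; // stand-in for UniformQuantizationInfo

static float dequantize_s8(int8_t q, UQInfo qi)
{
    return (static_cast<int32_t>(q) - qi.offset) * qi.scale;
}

int main()
{
    const UQInfo qa{ 0.05f, -10 };
    const UQInfo qb{ 0.10f, 3 };
    const int8_t a = 30, b = 15;               // quantized inputs
    const float afs = dequantize_s8(a, qa);    // 2.0
    const float bfs = dequantize_s8(b, qb);    // 1.2
    const uint8_t out = (afs > bfs) ? 255 : 0; // e.g. Greater
    std::printf("%f %f -> %u\n", afs, bfs, static_cast<unsigned>(out));
    return 0;
}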

◆ elementwise_op() [1/2]

void arm_compute::cpu::elementwise_op ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window,
OutputScalarType(*)(const InputScalarType &, const InputScalarType &)  scalar_func,
int(*)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool)  broadcast_func,
int(*)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)  neon_func 
)

Definition at line 36 of file elementwise_list.h.

40 {
41  // Create input windows
42  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
43  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
44 
45  // Clear X Dimension on execution window as we handle manually
46  Window win = window;
47  win.set(Window::DimX, Window::Dimension(0, 1, 1));
48 
49  const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
50  const auto window_start_x = static_cast<int>(window.x().start());
51  const auto window_end_x = static_cast<int>(window.x().end());
52  const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
53 
54  if(is_broadcast_across_x)
55  {
56  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
57  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
58  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
59  const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
60  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
61 
62  // Clear X Dimension on execution window as we handle manually
63  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
64 
65  Iterator broadcast_input(broadcast_tensor, broadcast_win);
66  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
67  Iterator output(out, win);
68 
69  execute_window_loop(win, [&](const Coordinates &)
70  {
71  auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
72  const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
73  const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
74 
75  int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
76  for(; x < window_end_x; ++x)
77  {
78  const auto a = *(non_broadcast_input_ptr + x);
79  *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
80  }
81  },
82  broadcast_input, non_broadcast_input, output);
83  }
84  else
85  {
86  // Clear X Dimension on execution window as we handle manually
87  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
88  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
89 
90  Iterator input1(in1, input1_win);
91  Iterator input2(in2, input2_win);
92  Iterator output(out, win);
93 
94  execute_window_loop(win, [&](const Coordinates &)
95  {
96  auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
97  const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
98  const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
99 
100  int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
101  for(; x < window_end_x; ++x)
102  {
103  const auto a = *(input1_ptr + x);
104  const auto b = *(input2_ptr + x);
105  *(output_ptr + x) = (*scalar_func)(a, b);
106  }
107  },
108  input1, input2, output);
109  }
110 }

References arm_compute::test::validation::b, Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), Dimensions< T >::x(), and Window::x().
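
The broadcast path is taken when one input has extent 1 on X (its window step collapses to 0); a single value is then reused across the whole row while the other input advances element by element. A standalone sketch of that behaviour, illustrative only:

#include <cstdio>
#include <vector>

// Elementwise add with optional broadcast of the second operand along X.
static void add_broadcast_x(const std::vector<float> &a, // length n
                            const std::vector<float> &b, // length n or 1
                            std::vector<float> &out)
{
    const bool b_is_broadcast = (b.size() == 1); // mirrors "x().step() == 0"
    for(size_t x = 0; x < a.size(); ++x)
    {
        const float bv = b_is_broadcast ? b[0] : b[x];
        out[x] = a[x] + bv;
    }
}

int main()
{
    std::vector<float> a{ 1, 2, 3, 4 }, b{ 10 }, out(4);
    add_broadcast_x(a, b, out);
    std::printf("%f %f\n", out[0], out[3]); // 11 14
    return 0;
}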

◆ elementwise_op() [2/2]

void arm_compute::cpu::elementwise_op ( const ITensor in,
ITensor out,
const Window window,
ElementWiseUnary  op 
)

Definition at line 83 of file elementwise_unary_list.h.

84 {
85  const int window_step_x = 16 / sizeof(ScalarType);
86  const auto window_start_x = static_cast<int>(window.x().start());
87  const auto window_end_x = static_cast<int>(window.x().end());
88 
89  Window win = window;
90  win.set(Window::DimX, Window::Dimension(0, 1, 1));
91 
92  Iterator input(in, win);
93  Iterator output(out, win);
94 
95  execute_window_loop(win, [&](const Coordinates &)
96  {
97  auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
98  const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
99 
100  int x = window_start_x;
101  for(; x <= window_end_x - window_step_x; x += window_step_x)
102  {
103  wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
104  }
105  for(; x < window_end_x; ++x)
106  {
107  *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
108  }
109  },
110  input, output);
111 }

References Window::DimX, elementwise_op_scalar_imp(), Window::Dimension::end(), arm_compute::execute_window_loop(), arm_compute::test::validation::input, Iterator::ptr(), Window::set(), Window::Dimension::start(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vstore(), and Window::x().

◆ elementwise_op_imp()

VectorType arm_compute::cpu::elementwise_op_imp ( ElementWiseUnary  op,
const VectorType &  a 
)
inline

Definition at line 59 of file elementwise_unary_list.h.

60 {
61  switch(op)
62  {
63  case ElementWiseUnary::RSQRT:
64  return wrapper::vinvsqrt(a);
65  case ElementWiseUnary::EXP:
66  return wrapper::vexpq(a);
67  case ElementWiseUnary::NEG:
68  return wrapper::vneg(a);
69  case ElementWiseUnary::LOG:
70  return wrapper::vlog(a);
71  case ElementWiseUnary::ABS:
72  return wrapper::vabs(a);
73  case ElementWiseUnary::ROUND:
74  return wrapper::vround(a);
75  case ElementWiseUnary::SIN:
76  return wrapper::vsin(a);
77  default:
78  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
79  }
80 }

References arm_compute::ABS, ARM_COMPUTE_ERROR, arm_compute::EXP, arm_compute::LOG, arm_compute::NEG, arm_compute::ROUND, arm_compute::RSQRT, arm_compute::SIN, arm_compute::wrapper::vabs(), arm_compute::wrapper::vexpq(), arm_compute::wrapper::vinvsqrt(), arm_compute::wrapper::vlog(), arm_compute::wrapper::vneg(), arm_compute::wrapper::vround(), and arm_compute::wrapper::vsin().

◆ elementwise_op_quantized()

void arm_compute::cpu::elementwise_op_quantized ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window,
uint8_t(*)(const float &, const float &, UniformQuantizationInfo scalar_func,
int(*)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool)  broadcast_func,
int(*)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)  neon_func 
)

Definition at line 299 of file elementwise_quantized_list.h.

306 {
307  // Create input windows
308  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
309  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
310 
311  // Clear X Dimension on execution window as we handle manually
312  Window win = window;
313  win.set(Window::DimX, Window::Dimension(0, 1, 1));
314 
315  const int window_step_x = 16;
316  const auto window_start_x = static_cast<int>(window.x().start());
317  const auto window_end_x = static_cast<int>(window.x().end());
318  const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
319 
320  const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
321 
322  // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero)
323  const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f);
324  const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
325 
326  if(is_broadcast_across_x)
327  {
328  // Select the broadcast input on the X axis
329  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
330  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
331  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
332  const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
333  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
334 
335  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
336  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
337 
338  const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
339  const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
340 
341  // Clear X Dimension on execution window as we handle manually
342  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
343 
344  Iterator broadcast_input(broadcast_tensor, broadcast_win);
345  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
346  Iterator output(out, win);
347 
348  execute_window_loop(win, [&](const Coordinates &)
349  {
350  const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
351  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
352 
353  const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
354  const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
355 
356  int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
357  voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
358  for(; x < window_end_x; ++x)
359  {
360  const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
361  const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
362  *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
363  }
364  },
365  broadcast_input, non_broadcast_input, output);
366  }
367  else
368  {
369  const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
370  const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
371 
372  // Input1 quantization info
373  const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
374  const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
375 
376  // Input2 quantization info
377  const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
378  const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
379 
380  // Clear X Dimension on execution window as we handle manually
381  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
382  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
383 
384  Iterator input1(in1, input1_win);
385  Iterator input2(in2, input2_win);
386  Iterator output(out, win);
387 
388  execute_window_loop(win, [&](const Coordinates &)
389  {
390  const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
391  const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
392  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
393 
394  int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
395  vscale1, vscale2, voffseto, invvscaleo);
396  for(; x < window_end_x; ++x)
397  {
398  const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
399  const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
400  *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
401  }
402  },
403  input1, input2, output);
404  }
405 }

References Window::broadcast_if_dimension_le_one(), arm_compute::dequantize_qasymm8(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), arm_compute::vdequantize(), Dimensions< T >::x(), and Window::x().

Referenced by elementwise_arithm_op_quantized(), and elementwise_comp_op_quantized().
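
The quantized helpers rest on the asymmetric QASYMM8 maths: dequantize with (q - offset) * scale and requantize with value / scale + offset, where baking 0.5 into the output offset (as voffseto does above) makes the later float-to-integer truncation behave like round-to-nearest for non-negative results. A standalone sketch of that maths, illustrative only:

#include <cstdint>
#include <cstdio>

struct UQInfo { float scale; int32_t offset; }; // stand-in for UniformQuantizationInfo

static float dequantize(uint8_t q, UQInfo qi)
{
    return (static_cast<int32_t>(q) - qi.offset) * qi.scale;
}

static uint8_t quantize(float v, UQInfo qi)
{
    const float q = v / qi.scale + (qi.offset + 0.5f); // +0.5 then truncate ~ round to nearest
    const int   r = static_cast<int>(q);
    return static_cast<uint8_t>(r < 0 ? 0 : (r > 255 ? 255 : r));
}

int main()
{
    const UQInfo in1{ 0.02f, 128 }, in2{ 0.03f, 100 }, outq{ 0.05f, 10 };
    const float a = dequantize(200, in1); // 1.44
    const float b = dequantize(150, in2); // 1.50
    std::printf("sum -> %u\n", static_cast<unsigned>(quantize(a + b, outq))); // 69
    return 0;
}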

◆ elementwise_op_quantized_signed()

void arm_compute::cpu::elementwise_op_quantized_signed ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window,
int8_t(*)(const float &, const float &, UniformQuantizationInfo scalar_func,
int(*)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool)  broadcast_func,
int(*)(int, int, int, const int8_t *, const int8_t *, int8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)  neon_func 
)

Definition at line 514 of file elementwise_quantized_list.h.

521 {
522  // Create input windows
523  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
524  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
525 
526  // Clear X Dimension on execution window as we handle manually
527  Window win = window;
528  win.set(Window::DimX, Window::Dimension(0, 1, 1));
529 
530  const int window_step_x = 16;
531  const auto window_start_x = static_cast<int>(window.x().start());
532  const auto window_end_x = static_cast<int>(window.x().end());
533  const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
534 
535  const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
536 
537  const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
538  const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
539 
540  if(is_broadcast_across_x)
541  {
542  // Select the broadcast input on the X axis
543  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
544  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
545  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
546  const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
547  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
548 
549  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
550  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
551 
552  const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
553  const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
554 
555  // Clear X Dimension on execution window as we handle manually
556  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
557 
558  Iterator broadcast_input(broadcast_tensor, broadcast_win);
559  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
560  Iterator output(out, win);
561 
562  execute_window_loop(win, [&](const Coordinates &)
563  {
564  const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
565  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
566 
567  const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
568  const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
569 
570  int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
571  voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
572  for(; x < window_end_x; ++x)
573  {
574  const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
575  const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
576  *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
577  }
578  },
579  broadcast_input, non_broadcast_input, output);
580  }
581  else
582  {
583  const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
584  const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
585 
586  // Input1 quantization info
587  const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
588  const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
589 
590  // Input2 quantization info
591  const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
592  const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
593 
594  // Clear X Dimension on execution window as we handle manually
595  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
596  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
597 
598  Iterator input1(in1, input1_win);
599  Iterator input2(in2, input2_win);
600  Iterator output(out, win);
601 
602  execute_window_loop(win, [&](const Coordinates &)
603  {
604  const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
605  const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
606  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
607 
608  int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
609  vscale1, vscale2, voffseto, invvscaleo);
610  for(; x < window_end_x; ++x)
611  {
612  const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
613  const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
614  *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
615  }
616  },
617  input1, input2, output);
618  }
619 }

References Window::broadcast_if_dimension_le_one(), arm_compute::dequantize_qasymm8_signed(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), arm_compute::vdequantize(), Dimensions< T >::x(), and Window::x().

Referenced by elementwise_arithm_op_quantized_signed().

◆ elementwise_op_scalar_imp()

ScalarType arm_compute::cpu::elementwise_op_scalar_imp ( ElementWiseUnary  op,
const ScalarType &  a 
)
inline

Definition at line 35 of file elementwise_unary_list.h.

36 {
37  switch(op)
38  {
39  case ElementWiseUnary::RSQRT:
40  return 1 / sqrt(a);
41  case ElementWiseUnary::EXP:
42  return std::exp(a);
43  case ElementWiseUnary::NEG:
44  return -a;
45  case ElementWiseUnary::LOG:
46  return std::log(a);
47  case ElementWiseUnary::ABS:
48  return std::abs(a);
49  case ElementWiseUnary::ROUND:
50  return support::cpp11::nearbyint(a);
51  case ElementWiseUnary::SIN:
52  return std::sin(a);
53  default:
54  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
55  }
56 }

References arm_compute::ABS, ARM_COMPUTE_ERROR, arm_compute::EXP, arm_compute::LOG, arm_compute::support::cpp11::nearbyint(), arm_compute::NEG, arm_compute::ROUND, arm_compute::RSQRT, and arm_compute::SIN.

Referenced by elementwise_op().
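
The scalar dispatch maps each ElementWiseUnary operation onto the matching standard maths routine. A standalone sketch of the same mapping, using an illustrative enum and <cmath> rather than the library's own types:

#include <cmath>
#include <cstdio>
#include <stdexcept>

enum class UnaryOp { RSQRT, EXP, NEG, LOG, ABS, ROUND, SIN }; // illustrative stand-in

static float unary_scalar(UnaryOp op, float a)
{
    switch(op)
    {
        case UnaryOp::RSQRT: return 1.f / std::sqrt(a);
        case UnaryOp::EXP:   return std::exp(a);
        case UnaryOp::NEG:   return -a;
        case UnaryOp::LOG:   return std::log(a);
        case UnaryOp::ABS:   return std::abs(a);
        case UnaryOp::ROUND: return std::nearbyint(a);
        case UnaryOp::SIN:   return std::sin(a);
        default: throw std::runtime_error("NOT_SUPPORTED!");
    }
}

int main()
{
    std::printf("%f %f\n", unary_scalar(UnaryOp::RSQRT, 4.f), unary_scalar(UnaryOp::ROUND, 2.5f)); // 0.5 2
    return 0;
}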

◆ fp16_neon_activation()

void arm_compute::cpu::fp16_neon_activation ( const ITensor src,
ITensor dst,
const ActivationLayerInfo act_info,
const Window window 
)

◆ fp16_neon_batch_normalization()

void arm_compute::cpu::fp16_neon_batch_normalization ( ITensor src,
ITensor dst,
const ITensor mean,
const ITensor var,
const ITensor beta,
const ITensor gamma,
float  epsilon,
ActivationLayerInfo act_info,
const Window window 
)

◆ fp16_neon_floor()

void arm_compute::cpu::fp16_neon_floor ( const void *  src,
void *  dst,
int  len 
)

◆ fp16_sve_activation()

void arm_compute::cpu::fp16_sve_activation ( const ITensor src,
ITensor dst,
const ActivationLayerInfo act_info,
const Window window 
)

◆ fp16_sve_batch_normalization()

void arm_compute::cpu::fp16_sve_batch_normalization ( ITensor src,
ITensor dst,
const ITensor mean,
const ITensor var,
const ITensor beta,
const ITensor gamma,
float  epsilon,
ActivationLayerInfo act_info,
const Window window 
)

◆ fp16_sve_scale()

void arm_compute::cpu::fp16_sve_scale ( const ITensor src,
ITensor dst,
const ITensor offsets,
const ITensor dx,
const ITensor dy,
InterpolationPolicy  policy,
BorderMode  border_mode,
PixelValue  constant_border_value,
float  sampling_offset,
bool  align_corners,
const Window window 
)

◆ fp32_neon_activation()

void fp32_neon_activation ( const ITensor src,
ITensor dst,
const ActivationLayerInfo act_info,
const Window window 
)

Definition at line 49 of file fp32.cpp.

50 {
51  /** SIMD vector tag type. */
52  using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
53 
54  constexpr int window_step_x = 4;
55  const auto window_start_x = static_cast<int>(window.x().start());
56  const auto window_end_x = static_cast<int>(window.x().end());
57  const ActivationLayerInfo::ActivationFunction act = act_info.activation();
58 
59  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
60  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
61 
62  Iterator input(src, win_collapsed);
63  Iterator output(dst, win_collapsed);
64 
65  // In case of non-aarch64, a small delta value is added to the input
66  // to prevent NAN values caused by zeros in inputs to SQRT.
67  // In case of aarch64, we call vsqrt directly, so we don't use delta.
68 #ifndef __aarch64__
69  const auto delta = wrapper::vdup_n(static_cast<float>(1e-24), ExactTagType {});
70 #endif /* __aarch64__ */
71  const auto const_1 = wrapper::vdup_n(static_cast<float>(1.f), ExactTagType {});
72  const auto const_0 = wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
73  const auto const_6 = wrapper::vdup_n(static_cast<float>(6.f), ExactTagType{});
74  const auto const_3 = wrapper::vdup_n(static_cast<float>(3.f), ExactTagType{});
75  const auto const_inv_6 = wrapper::vdup_n(static_cast<float>(0.166666667f), ExactTagType{});
76 
77  constexpr float soft_relu_thresh = 12.f;
78  const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float>(soft_relu_thresh), ExactTagType{});
79 
80  const auto va = wrapper::vdup_n(static_cast<float>(act_info.a()), ExactTagType{});
81  const auto vb = wrapper::vdup_n(static_cast<float>(act_info.b()), ExactTagType{});
82  const auto a = static_cast<float>(act_info.a());
83  const auto b = static_cast<float>(act_info.b());
84  execute_window_loop(win_collapsed, [&](const Coordinates &)
85  {
86  const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
87  const auto output_ptr = reinterpret_cast<float *>(output.ptr());
88 
89  wrapper::traits::neon_bitvector_t<float, wrapper::traits::BitWidth::W128> tmp;
90 
91  // Compute S elements per iteration
92  int x = window_start_x;
93  for(; x <= (window_end_x - window_step_x); x += window_step_x)
94  {
95  const auto vin = wrapper::vloadq(input_ptr + x);
96  switch(act)
97  {
98  case ActivationLayerInfo::ActivationFunction::ABS:
99  tmp = wrapper::vabs(vin);
100  break;
101  case ActivationLayerInfo::ActivationFunction::LINEAR:
102  tmp = wrapper::vmla(vb, va, vin);
103  break;
104  case ActivationLayerInfo::ActivationFunction::LOGISTIC:
105  tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
106  break;
107  case ActivationLayerInfo::ActivationFunction::RELU:
108  tmp = wrapper::vmax(const_0, vin);
109  break;
110  case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
111  tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
112  break;
113  case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
114  tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
115  break;
116  case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
117  tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
118  break;
119  case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
120  tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
121  break;
122  case ActivationLayerInfo::ActivationFunction::ELU:
123  tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
124  break;
125  case ActivationLayerInfo::ActivationFunction::SQRT:
126 #ifdef __aarch64__
127  tmp = wrapper::vsqrt(vin);
128 #else /* __aarch64__ */
129  {
130  const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{}));
131  tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
132  tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
133  }
134 #endif /* __aarch64__ */
135  break;
136  case ActivationLayerInfo::ActivationFunction::SQUARE:
137  tmp = wrapper::vmul(vin, vin);
138  break;
139  case ActivationLayerInfo::ActivationFunction::TANH:
140  tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
141  break;
142  case ActivationLayerInfo::ActivationFunction::IDENTITY:
143  tmp = vin;
144  break;
145  case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
146  tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
147  break;
148  default:
149  ARM_COMPUTE_ERROR("Unsupported activation function");
150  }
151  wrapper::vstore(output_ptr + x, tmp);
152  }
153 
154  // Compute left-over elements
155  for(; x < window_end_x; ++x)
156  {
157  const float in = *(reinterpret_cast<const float *>(input_ptr + x));
158  float tmp;
159  switch(act)
160  {
161  case ActivationLayerInfo::ActivationFunction::ABS:
162  tmp = std::abs(in);
163  break;
164  case ActivationLayerInfo::ActivationFunction::LINEAR:
165  tmp = a * in + b;
166  break;
167  case ActivationLayerInfo::ActivationFunction::LOGISTIC:
168  tmp = static_cast<float>(1) / (static_cast<float>(1) + std::exp(-in));
169  break;
170  case ActivationLayerInfo::ActivationFunction::RELU:
171  tmp = std::max<float>(static_cast<float>(0), in);
172  break;
173  case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
174  tmp = std::min<float>(a, std::max(static_cast<float>(0), in));
175  break;
176  case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
177  tmp = std::min<float>(a, std::max<float>(b, in));
178  break;
179  case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
180  tmp = (in > 0) ? in : a * in;
181  break;
182  case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
183  tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float>(1) + std::exp(in));
184  break;
185  case ActivationLayerInfo::ActivationFunction::ELU:
186  tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
187  break;
188  case ActivationLayerInfo::ActivationFunction::SQRT:
189  tmp = std::sqrt(in);
190  break;
191  case ActivationLayerInfo::ActivationFunction::SQUARE:
192  tmp = in * in;
193  break;
194  case ActivationLayerInfo::ActivationFunction::TANH:
195  tmp = a * std::tanh(b * in);
196  break;
197  case ActivationLayerInfo::ActivationFunction::IDENTITY:
198  tmp = in;
199  break;
200  case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
201  tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
202  break;
203  default:
204  ARM_COMPUTE_ERROR("Unsupported activation function");
205  }
206  *(output_ptr + x) = tmp;
207  }
208  },
209  input, output);
210 }

References ActivationLayerInfo::a(), ActivationLayerInfo::ABS, ActivationLayerInfo::activation(), ARM_COMPUTE_ERROR, arm_compute::test::validation::b, ActivationLayerInfo::b(), ActivationLayerInfo::BOUNDED_RELU, Window::collapse_if_possible(), Window::DimX, Window::DimZ, arm_compute::test::validation::dst, ActivationLayerInfo::ELU, Window::Dimension::end(), arm_compute::execute_window_loop(), ActivationLayerInfo::HARD_SWISH, ActivationLayerInfo::IDENTITY, arm_compute::test::validation::input, ActivationLayerInfo::LEAKY_RELU, ActivationLayerInfo::LINEAR, ActivationLayerInfo::LOGISTIC, ActivationLayerInfo::LU_BOUNDED_RELU, Iterator::ptr(), ActivationLayerInfo::RELU, Window::set(), ActivationLayerInfo::SOFT_RELU, ActivationLayerInfo::SQRT, ActivationLayerInfo::SQUARE, arm_compute::test::validation::src, Window::Dimension::start(), ActivationLayerInfo::TANH, arm_compute::wrapper::vabs(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vbsl(), arm_compute::wrapper::vceq(), arm_compute::wrapper::vcge(), arm_compute::wrapper::vcgt(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vexpq(), arm_compute::wrapper::vinv(), arm_compute::wrapper::vinvsqrt(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vlog(), arm_compute::wrapper::vmax(), arm_compute::wrapper::vmin(), arm_compute::wrapper::vmla(), arm_compute::wrapper::vmul(), arm_compute::wrapper::vneg(), arm_compute::wrapper::vnot(), arm_compute::wrapper::vstore(), arm_compute::wrapper::vsub(), arm_compute::wrapper::vtanh(), and Window::x().

◆ fp32_neon_batch_normalization()

void fp32_neon_batch_normalization ( ITensor src,
ITensor dst,
const ITensor mean,
const ITensor var,
const ITensor beta,
const ITensor gamma,
float  epsilon,
ActivationLayerInfo act_info,
const Window window 
)

Definition at line 135 of file fp32.cpp.

137 {
138  if(act_info.enabled())
139  {
140  fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
141  }
142  else
143  {
144  batch_normalization<detail::dummy<float, 4>>(src, dst, mean, var, beta, gamma, epsilon, act_info, window);
145  }
146 }

References ActivationLayerInfo::activation(), arm_compute::test::validation::dst, ActivationLayerInfo::enabled(), arm_compute::quantization::epsilon, and arm_compute::test::validation::src.

◆ fp32_neon_floor()

void fp32_neon_floor ( const void *  src,
void *  dst,
int  len 
)

Definition at line 37 of file fp32.cpp.

38 {
39  ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
40  ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
41  ARM_COMPUTE_ASSERT(len >= 0);
42 
43  auto psrc = static_cast<const float *>(src);
44  auto pdst = static_cast<float *>(dst);
45 
46  for(; len >= step; len -= step)
47  {
48  vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc)));
49  psrc += step;
50  pdst += step;
51  }
52 
53  for(; len > 0; --len)
54  {
55  *pdst = std::floor(*psrc);
56  ++pdst;
57  ++psrc;
58  }
59 }

References ARM_COMPUTE_ASSERT, ARM_COMPUTE_ASSERT_NOT_NULLPTR, arm_compute::test::validation::dst, arm_compute::test::validation::src, step, and arm_compute::vfloorq_f32().

◆ fp32_sve_activation()

void arm_compute::cpu::fp32_sve_activation ( const ITensor src,
ITensor dst,
const ActivationLayerInfo act_info,
const Window window 
)

◆ fp32_sve_batch_normalization()

void arm_compute::cpu::fp32_sve_batch_normalization ( ITensor src,
ITensor dst,
const ITensor mean,
const ITensor var,
const ITensor beta,
const ITensor gamma,
float  epsilon,
ActivationLayerInfo act_info,
const Window window 
)

◆ fp32_sve_scale()

void arm_compute::cpu::fp32_sve_scale ( const ITensor src,
ITensor dst,
const ITensor offsets,
const ITensor dx,
const ITensor dy,
InterpolationPolicy  policy,
BorderMode  border_mode,
PixelValue  constant_border_value,
float  sampling_offset,
bool  align_corners,
const Window window 
)

◆ load_quantized()

float32x4x4_t arm_compute::cpu::load_quantized ( const uint8_t *  input1_ptr,
const int32x4_t &  offset,
const float32x4_t &  scale 
)

Definition at line 33 of file elementwise_quantized_list.h.

34 {
35  qasymm8x16_t x = vld1q_u8(input1_ptr);
36  const float32x4x4_t out =
37  {
38  {
39  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
40  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
41  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
42  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
43  }
44  };
45  return out;
46 }

References offset(), and arm_compute::test::validation::scale.

Referenced by elementwise_arithm_op_quantized_broadcast_loop(), elementwise_arithm_op_quantized_loop(), elementwise_comp_op_quantized_broadcast_loop(), and elementwise_comp_op_quantized_loop().
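
Per element this amounts to widening each of the 16 uint8 values, subtracting the offset and multiplying by the scale; the NEON version does the same four float32x4 lanes at a time. A scalar standalone sketch of the computation, illustrative only:

#include <cstdint>
#include <cstdio>

// Dequantize a block of 16 QASYMM8 values: real = (q - offset) * scale.
static void dequantize_block16(const uint8_t *in, int32_t offset, float scale, float *out)
{
    for(int i = 0; i < 16; ++i)
    {
        out[i] = static_cast<float>(static_cast<int32_t>(in[i]) - offset) * scale;
    }
}

int main()
{
    uint8_t q[16];
    float   f[16];
    for(int i = 0; i < 16; ++i) { q[i] = static_cast<uint8_t>(i * 10); }
    dequantize_block16(q, 128, 0.02f, f);
    std::printf("%f %f\n", f[0], f[15]); // -2.56 0.44
    return 0;
}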

◆ load_quantized_signed()

float32x4x4_t arm_compute::cpu::load_quantized_signed ( const int8_t *  input1_ptr,
const int32x4_t &  offset,
const float32x4_t &  scale 
)

Definition at line 48 of file elementwise_quantized_list.h.

49 {
50  qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
51  const float32x4x4_t out =
52  {
53  {
54  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
55  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
56  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
57  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
58  }
59  };
60  return out;
61 }

References offset(), and arm_compute::test::validation::scale.

Referenced by elementwise_arithm_op_quantized_signed_broadcast_loop(), elementwise_arithm_op_quantized_singed_loop(), elementwise_comp_op_quantized_signed_broadcast_loop(), and elementwise_comp_op_quantized_signed_loop().

◆ nearest_neon_scale()

void arm_compute::cpu::nearest_neon_scale ( const ITensor src,
ITensor dst,
const ITensor offsets,
float  sampling_offset,
bool  align_corners,
const Window window 
)

Definition at line 51 of file list.h.

53 {
54  const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
55  const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
56  const size_t in_stride_wc = in_stride_w * in_stride_c;
57  const size_t in_dim_h = src->info()->dimension(2);
58 
59  // Compute the ratio between source height and destination height
60  const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
61  const auto window_start_x = static_cast<int32_t>(window.x().start());
62  const auto window_end_x = static_cast<int32_t>(window.x().end());
63  const int window_step_x = 16 / sizeof(T);
64 
65  Window win(window);
66  win.set(Window::DimX, Window::Dimension(0, 1, 1));
67  Iterator out(dst, win);
68 
69  const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
70  const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
71 
72  execute_window_loop(win, [&](const Coordinates & id)
73  {
74  const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
75  const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
76  const int offset_row = in_hi * in_stride_wc;
77  int32_t x = window_start_x;
78  const T *in_ptr = reinterpret_cast<const T *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
79 
80  for(; x <= window_end_x - window_step_x; x += window_step_x)
81  {
82  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x,
83  wrapper::vloadq(in_ptr + offset + offset_row + x));
84  }
85  for(; x < window_end_x; ++x)
86  {
87  *(reinterpret_cast<T *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
88  }
89  },
90  out);
91 }
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:861
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
T round_half_away_from_zero(T value)
Round floating-point value with half value rounding away from zero.
Definition: Rounding.h:106
SimpleTensor< float > src
Definition: DFT.cpp:155
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
float calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners=false)
Returns resize ratio between input and output with consideration of aligned corners.
Definition: ScaleUtils.cpp:27

References arm_compute::scale_utils::calculate_resize_ratio(), Window::DimX, arm_compute::test::validation::dst, Window::Dimension::end(), arm_compute::execute_window_loop(), offset(), Iterator::ptr(), ITensor::ptr_to_element(), arm_compute::utils::rounding::round_half_away_from_zero(), Window::set(), arm_compute::test::validation::src, Window::Dimension::start(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vstore(), and Window::x().
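
For each output coordinate the kernel picks a single source element: the resize ratio maps the output index back into the source, and the result is either rounded (aligned corners) or floored. A standalone sketch of that index computation, assuming the common ratio convention of in/out, or (in - 1)/(out - 1) when corners are aligned; illustrative only:

#include <cmath>
#include <cstddef>
#include <cstdio>

static float resize_ratio(std::size_t in, std::size_t out, bool align_corners)
{
    return align_corners ? float(in - 1) / float(out - 1) : float(in) / float(out);
}

static int nearest_index(int out_i, float sampling_offset, float hr, bool align_corners)
{
    const float pos = (out_i + sampling_offset) * hr;
    // round half away from zero when corners are aligned, floor otherwise
    return align_corners ? static_cast<int>(std::lround(pos)) : static_cast<int>(std::floor(pos));
}

int main()
{
    const float hr = resize_ratio(8, 4, false); // 2.0
    for(int i = 0; i < 4; ++i)
    {
        std::printf("out %d <- in %d\n", i, nearest_index(i, 0.f, hr, false));
    }
    return 0;
}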

◆ neon_logits_1d_max()

void arm_compute::cpu::neon_logits_1d_max ( const ITensor in,
ITensor out,
const Window window 
)

Definition at line 37 of file list.h.

38 {
39  /** SIMD vector tag type. */
40  using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
41 
42  constexpr int window_step_x = 16 / sizeof(T);
43  const auto window_start_x = static_cast<int>(window.x().start());
44  const auto window_end_x = static_cast<int>(window.x().end());
45 
46  Window win{ window };
47  win.set(Window::DimX, Window::Dimension(0, 1, 1));
48  Iterator input(in, win);
49  Iterator output(out, win);
50 
51  const int sum_stages = log2(window_step_x / 2);
52  execute_window_loop(win, [&](const Coordinates &)
53  {
54  // Get pointers
55  const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
56  const auto out_ptr = reinterpret_cast<T *>(output.ptr());
57 
58  // Init max value
59  auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
60  int x = window_start_x;
61 
62  for(; x <= (window_end_x - window_step_x); x += window_step_x)
63  {
64  const auto current_value = wrapper::vloadq(in_ptr + x);
65  vec_max = wrapper::vmax(vec_max, current_value);
66  }
67  auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
68 
69  for(int i = 0; i < sum_stages; ++i)
70  {
71  carry_max = wrapper::vpmax(carry_max, carry_max);
72  }
73  T max_val = wrapper::vgetlane(carry_max, 0);
74 
75  // Compute left-over elements
76  for(; x < window_end_x; ++x)
77  {
78  max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
79  }
80 
81  *out_ptr = max_val;
82  },
83  input, output);
84 }

References Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), arm_compute::test::validation::input, Iterator::ptr(), Window::set(), Window::Dimension::start(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vgethigh(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vgetlow(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmax(), arm_compute::wrapper::vpmax(), and Window::x().
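
In scalar terms, the kernel keeps a lane-wise running maximum over whole vectors, folds the lanes pairwise down to a single value (the vpmax stages), and finishes the row with a scalar tail loop. A minimal sketch with a std::array standing in for the 128-bit register; names are illustrative.

#include <algorithm>
#include <array>
#include <cstdio>
#include <limits>
#include <vector>

float row_max(const std::vector<float> &row)
{
    constexpr int lanes = 4; // four float lanes per 128-bit vector
    std::array<float, lanes> vec_max;
    vec_max.fill(std::numeric_limits<float>::lowest());

    // Main loop: lane-wise maximum over full vectors.
    size_t x = 0;
    for(; x + lanes <= row.size(); x += lanes)
    {
        for(int l = 0; l < lanes; ++l)
        {
            vec_max[l] = std::max(vec_max[l], row[x + l]);
        }
    }

    // Pairwise folds, as vpmax does: 4 lanes -> 2 -> 1.
    const float m01     = std::max(vec_max[0], vec_max[1]);
    const float m23     = std::max(vec_max[2], vec_max[3]);
    float       max_val = std::max(m01, m23);

    // Left-over elements.
    for(; x < row.size(); ++x)
    {
        max_val = std::max(max_val, row[x]);
    }
    return max_val;
}

int main()
{
    std::printf("%f\n", row_max({ 1.f, -2.f, 7.5f, 0.f, 3.f }));
    return 0;
}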

◆ neon_softmax_logits_1d_float()

void arm_compute::cpu::neon_softmax_logits_1d_float ( const ITensor *  in,
const ITensor *  max,
void *const  tmp,
ITensor *  out,
const float  beta,
bool  is_log,
const Window &  window 
)

Computes a 1D softmax (or log softmax) over floating-point logits using Neon.

Definition at line 260 of file list.h.

262 {
263  const int start_x = in->info()->valid_region().anchor.x();
264  const int input_width = in->info()->valid_region().shape.x();
265 
266  Iterator in_it(in, window);
267  Iterator max_it(max, window);
268  Iterator out_it(out, window);
269 
270  /** SIMD vector tag type. */
271  using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
272 
273  constexpr int vec_size = 16 / sizeof(T);
274  const int sum_stages = log2(vec_size / 2);
275 
276  execute_window_loop(window, [&](const Coordinates &)
277  {
278  /* Get pointers */
279  const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
280  const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
281  const auto tmp_ptr = reinterpret_cast<T *>(tmp);
282 
283  T sum{};
284  T sum_inversed{};
285 
286  /* Compute exponentials and sum */
287  {
288  /* Get max value */
289  const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
290  const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
291 
292  /* Init sum to zero */
293  auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
294 
295  /* Loop over row and compute exponentials and sum */
296  int x = 0;
297  for(; x <= (input_width - vec_size); x += vec_size)
298  {
299  auto vec_elements = wrapper::vloadq(in_ptr + x);
300  vec_elements = wrapper::vsub(vec_elements, vec_max);
301  if(is_log)
302  {
303  vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
304  vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
305  }
306  else
307  {
308  vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
309  vec_sum = wrapper::vadd(vec_sum, vec_elements);
310  }
311  wrapper::vstore(tmp_ptr + x, vec_elements);
312  }
313 
314  /* Reduce sum */
315  auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
316  for(int i = 0; i < sum_stages; ++i)
317  {
318  sum_res = wrapper::vpadd(sum_res, sum_res);
319  }
320  sum = wrapper::vgetlane(sum_res, 0);
321 
322  /* Run remaining elements */
323  for(; x < input_width; ++x)
324  {
325  T element{};
326 
327  if(is_log)
328  {
329  element = (in_ptr[x] - max_val) * beta;
330  sum += std::exp(element);
331  }
332  else
333  {
334  element = std::exp((in_ptr[x] - max_val) * beta);
335  sum += element;
336  }
337  tmp_ptr[x] = element;
338  }
339 
340  if(!is_log)
341  {
342  sum_inversed = T(1) / sum;
343  }
344  else
345  {
346  sum = static_cast<T>(std::log(sum));
347  }
348  }
349 
350  /* Normalize exponentials */
351  {
352  /* Loop over row and compute softmax */
353  int x = 0;
354  for(; x <= (input_width - vec_size); x += vec_size)
355  {
356  auto vec_in = wrapper::vloadq(tmp_ptr + x);
357  auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
358  if(is_log)
359  {
360  normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
361  }
362  else
363  {
364  normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
365  }
366  wrapper::vstore(out_ptr + x, normalized_value);
367  }
368  /* Run remaining elements */
369  for(; x < input_width; ++x)
370  {
371  if(is_log)
372  {
373  out_ptr[x] = tmp_ptr[x] - sum;
374  }
375  else
376  {
377  out_ptr[x] = tmp_ptr[x] * sum_inversed;
378  }
379  }
380  }
381  },
382  in_it, max_it, out_it);
383 }

References ValidRegion::anchor, arm_compute::execute_window_loop(), ITensor::info(), input_width, Iterator::ptr(), ValidRegion::shape, arm_compute::wrapper::vadd(), ITensorInfo::valid_region(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vexpq(), arm_compute::wrapper::vgethigh(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vgetlow(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmul(), arm_compute::wrapper::vpadd(), arm_compute::wrapper::vstore(), arm_compute::wrapper::vsub(), and Dimensions< T >::x().
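
The kernel implements the numerically stable two-pass (log) softmax: subtract the row maximum, scale by beta, exponentiate and accumulate the sum, then either multiply by 1/sum or, for the log variant, subtract log(sum). A scalar sketch of the same two passes, assuming only the standard library; names are illustrative.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void softmax_row(const std::vector<float> &in, float beta, bool is_log, std::vector<float> &out)
{
    const float max_val = *std::max_element(in.begin(), in.end());

    // Pass 1: exponentials (or scaled differences for log softmax) and their sum.
    std::vector<float> tmp(in.size());
    float sum = 0.f;
    for(size_t x = 0; x < in.size(); ++x)
    {
        const float shifted = (in[x] - max_val) * beta;
        tmp[x] = is_log ? shifted : std::exp(shifted);
        sum += std::exp(shifted);
    }

    // Pass 2: normalize.
    out.resize(in.size());
    if(is_log)
    {
        const float log_sum = std::log(sum);
        for(size_t x = 0; x < in.size(); ++x)
        {
            out[x] = tmp[x] - log_sum;
        }
    }
    else
    {
        const float sum_inversed = 1.f / sum;
        for(size_t x = 0; x < in.size(); ++x)
        {
            out[x] = tmp[x] * sum_inversed;
        }
    }
}

int main()
{
    std::vector<float> out;
    softmax_row({ 1.f, 2.f, 3.f }, 1.f, false, out);
    for(float v : out)
    {
        std::printf("%f ", v);
    }
    std::printf("\n");
    return 0;
}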

◆ neon_softmax_logits_1d_quantized()

void arm_compute::cpu::neon_softmax_logits_1d_quantized ( const ITensor *  in,
const ITensor *  max,
void *const  tmp,
ITensor *  out,
float  beta,
bool  is_log,
const Window &  window 
)

Definition at line 87 of file list.h.

89 {
90  static_assert(std::is_same<T, qasymm8_t>::value
91  || std::is_same<T, qasymm8_signed_t>::value,
92  "quantized type should be either qasymm8_t or qasymm8_signed_t.");
93 
94  const int start_x = in->info()->valid_region().anchor.x();
95  const int input_width = in->info()->valid_region().shape.x();
96 
97  const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
98  const auto scale_beta_vec = vdupq_n_f32(scale_beta);
99 
100  Iterator in_it(in, window);
101  Iterator max_it(max, window);
102  Iterator out_it(out, window);
103  constexpr int vec_size = 16;
104 
105  execute_window_loop(window, [&](const Coordinates &)
106  {
107  /* Get pointers */
108  const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
109  const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
110  const auto tmp_ptr = reinterpret_cast<float *>(tmp);
111 
112  float sum{};
113  float sum_inversed{};
114 
115  /* Compute exponentials and sum */
116  {
117  /* Get max value */
118  const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
119  const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
120 
121  /* Init sum to zero */
122  float32x4x4_t vec_sum =
123  {
124  vdupq_n_f32(0.f),
125  vdupq_n_f32(0.f),
126  vdupq_n_f32(0.f),
127  vdupq_n_f32(0.f),
128  };
129 
130  /* Loop over row and compute exponentials and sum */
131  int x = 0;
132  for(; x <= (input_width - vec_size); x += vec_size)
133  {
134  auto vec_elements = wrapper::vloadq(in_ptr + x);
135  vec_elements = wrapper::vqsub(vec_max, vec_elements);
136  auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
137 
138  if(is_log)
139  {
140  vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
141  vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
142  vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
143  vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
144  vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
145  vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
146  vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
147  vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
148  }
149  else
150  {
151  vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
152  vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
153  vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
154  vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
155  vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
156  vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
157  vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
158  vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
159  }
160 
161  vst4q_f32(tmp_ptr + x, vec_elements_flt);
162  }
163 
164  /* Reduce sum */
165  const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
166  auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
167  sum_res = vpadd_f32(sum_res, sum_res);
168  sum = wrapper::vgetlane(sum_res, 0);
169 
170  /* Run remaining elements */
171  for(; x < input_width; ++x)
172  {
173  float element{};
174  if(is_log)
175  {
176  element = (max_val - in_ptr[x]) * scale_beta;
177  sum += std::exp(element);
178  }
179  else
180  {
181  element = std::exp((max_val - in_ptr[x]) * scale_beta);
182  sum += element;
183  }
184 
185  tmp_ptr[x] = element;
186  }
187 
188  if(!is_log)
189  {
190  sum_inversed = 256.f / sum;
191  }
192  else
193  {
194  sum = std::log(sum);
195  }
196  }
197 
198  /* Normalize exponentials */
199  {
200  constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
201  /* Loop over row and compute softmax */
202  int x = 0;
203  for(; x <= (input_width - vec_size); x += vec_size)
204  {
205  using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
206  float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
207  int_vec_type normalized_value{};
208  if(is_log)
209  {
210  const float32x4x4_t sub =
211  {
212  vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
213  vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
214  vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
215  vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
216  };
217  normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
218  }
219  else
220  {
221  float32x4x4_t mul =
222  {
223  vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
224  vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
225  vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
226  vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
227  };
228 
229  if(is_qasymm8_signed)
230  {
231  const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
232  mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
233  mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
234  mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
235  mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
236  }
237 
238  normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
239  }
240  wrapper::vstore(out_ptr + x, normalized_value);
241  }
242  /* Run remaining elements */
243  for(; x < input_width; ++x)
244  {
245  if(is_log)
246  {
247  out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
248  }
249  else
250  {
251  out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
252  }
253  }
254  }
255  },
256  in_it, max_it, out_it);
257 }

References ValidRegion::anchor, arm_compute::execute_window_loop(), ITensor::info(), input_width, Iterator::ptr(), ITensorInfo::quantization_info(), UniformQuantizationInfo::scale, ValidRegion::shape, QuantizationInfo::uniform(), ITensorInfo::valid_region(), arm_compute::wrapper::vdup_n(), arm_compute::vexpq_f32(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vqsub(), arm_compute::wrapper::vstore(), arm_compute::wrapper::vsub(), and Dimensions< T >::x().
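
For the quantized path, the exponent argument is built as (max - input) * (-beta * input_scale), which keeps it non-positive, and the non-log result is mapped onto the 8-bit output range with 256 / sum (the QASYMM8_SIGNED variant additionally shifts by 128). A scalar sketch of the unsigned case, assuming only the standard library; saturate_u8 and the other names are illustrative, not the library's API.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

static uint8_t saturate_u8(float v)
{
    return static_cast<uint8_t>(std::min(255.f, std::max(0.f, std::round(v))));
}

void softmax_row_qasymm8(const std::vector<uint8_t> &in, float beta, float in_scale, std::vector<uint8_t> &out)
{
    const uint8_t max_val    = *std::max_element(in.begin(), in.end());
    const float   scale_beta = -beta * in_scale;

    // Exponentials of the (non-positive) scaled differences and their sum.
    std::vector<float> tmp(in.size());
    float sum = 0.f;
    for(size_t x = 0; x < in.size(); ++x)
    {
        tmp[x] = std::exp((max_val - in[x]) * scale_beta);
        sum += tmp[x];
    }

    // Map probabilities onto the 0..255 output range.
    const float sum_inversed = 256.f / sum;
    out.resize(in.size());
    for(size_t x = 0; x < in.size(); ++x)
    {
        out[x] = saturate_u8(tmp[x] * sum_inversed);
    }
}

int main()
{
    std::vector<uint8_t> out;
    softmax_row_qasymm8({ 10, 200, 50, 255 }, 1.f, 0.1f, out);
    for(unsigned v : out)
    {
        std::printf("%u ", v);
    }
    std::printf("\n");
    return 0;
}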

◆ offset_no_padding()

uint32_t arm_compute::cpu::offset_no_padding ( uint32_t  padded_offset,
const Coordinates &  id,
const ITensorInfo &  info,
int  pool_stride_x,
int  pool_stride_y,
DataLayout  data_layout 
)
inline

Definition at line 62 of file list.h.

63 {
64  const int pad_left = info.padding().left;
65  const int pad_right = info.padding().right;
66  const int pad_top = info.padding().top;
67  const int pad_bottom = info.padding().bottom;
68  const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
69  const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
70  const int pad_horiz = pad_left + pad_right;
71  const int pad_vert = pad_top + pad_bottom;
72 
73  if(data_layout == DataLayout::NCHW)
74  {
75  const uint32_t offset_base = padded_offset
76  - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
77  - pad_top * sizeof(T) /* top padding */
78  - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
79  - in_stride_w * id[3];
80 
81  return offset_base;
82  }
83  else
84  {
85  const uint32_t offset_base = padded_offset
86  - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
87  - pad_top * sizeof(T) // top padding
88  - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
89  - in_stride_w * id[3];
90 
91  return offset_base;
92  }
93 }

References arm_compute::test::validation::data_layout, arm_compute::test::validation::info, arm_compute::NCHW, and pool_stride_x.

◆ poolingMxN_fp16_neon_nhwc()

void arm_compute::cpu::poolingMxN_fp16_neon_nhwc ( const ITensor *  src0,
ITensor *  dst0,
ITensor *  dst1,
PoolingLayerInfo & ,
const Window &  window_src,
const Window &  window 
)

◆ poolingMxN_fp32_neon_nhwc()

void poolingMxN_fp32_neon_nhwc ( const ITensor *  src,
ITensor *  dst0,
ITensor *  dst1,
PoolingLayerInfo &  pool_info,
const Window &  window_src,
const Window &  window 
)

Definition at line 146 of file fp32.cpp.

147 {
148  if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
149  {
150  pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
151  }
152  else
153  {
154  const int window_start_x = window.x().start();
155  const int window_end_x = window.x().end();
156  const int window_step_x = 4;
157 
158  Window window_out = window;
159  window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
160 
161  Iterator in(src, window_src);
162  Iterator out(dst0, window_out);
163 
164  const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
165  const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
166  const int pool_pad_right = pool_info.pad_stride_info.pad_right();
167  const int pool_pad_top = pool_info.pad_stride_info.pad_top();
168  const int pool_pad_left = pool_info.pad_stride_info.pad_left();
169  const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
170  int pool_stride_x = 0;
171  int pool_stride_y = 0;
172  std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
173  const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
174  const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
175 
176  float32x4_t vres;
177 
178  execute_window_loop(window_out, [&](const Coordinates & id)
179  {
180  const int idx_width = id.y() * pool_stride_x;
181  const int idx_height = id.z() * pool_stride_y;
182  const int pool_limit_y = pool_pad_top - idx_height;
183  const int pool_limit_x = pool_pad_left - idx_width;
184 
185  const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
186  const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
187  const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
188  const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
189 
190  int x_off = window_start_x;
191  for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
192  {
193  if(pool_info.pool_type != PoolingType::MAX)
194  {
195  // Calculate scale
196  const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
197  pool_stride_y);
198  const float32x4_t scale_v = vdupq_n_f32(scale);
199 
200  // Perform pooling
201  vres = vdupq_n_f32(0.0f);
202 
203  for(int y = pool_start_y; y < pool_end_y; ++y)
204  {
205  for(int x = pool_start_x; x < pool_end_x; ++x)
206  {
207  const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
208  (src->info()->strides_in_bytes().z())) + x_off);
209 
210  // Get power of 2 in case of l2 pooling and accumulate
211  if(pool_info.pool_type == PoolingType::L2)
212  {
213  vres = vmlaq_f32(vres, data, data);
214  }
215  else
216  {
217  vres = vaddq_f32(vres, data);
218  }
219  }
220  }
221  // Divide by scale
222  vres = vmulq_f32(vres, scale_v);
223  }
224  else
225  {
226  vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
227  for(int y = pool_start_y; y < pool_end_y; ++y)
228  {
229  for(int x = pool_start_x; x < pool_end_x; ++x)
230  {
231  const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
232  (src->info()->strides_in_bytes().z())) + x_off);
233  vres = vmaxq_f32(vres, data);
234  }
235  }
236  }
237 
238  // Calculate square-root in case of l2 pooling
239  if(pool_info.pool_type == PoolingType::L2)
240  {
241  float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
242  static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
243  static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
244  static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
245  };
246  vres = l2_res;
247  }
248 
249  // Store result
250  vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
251  }
252 
253  // Left-overs loop
254  for(; x_off < window_end_x; ++x_off)
255  {
256  float res = 0.0f;
257 
258  if(pool_info.pool_type != PoolingType::MAX)
259  {
260  // Calculate scale
261  const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
262  pool_stride_y);
263 
264  for(int y = pool_start_y; y < pool_end_y; ++y)
265  {
266  for(int x = pool_start_x; x < pool_end_x; ++x)
267  {
268  const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
269  (src->info()->strides_in_bytes().z())) + x_off);
270 
271  // Get power of 2 in case of l2 pooling and accumulate
272  if(pool_info.pool_type == PoolingType::L2)
273  {
274  res += data * data;
275  }
276  else
277  {
278  res += data;
279  }
280  }
281  }
282 
283  // Divide by scale
284  res *= scale;
285  }
286  else
287  {
 288  res = support::cpp11::lowest<float>();
 289  for(int y = pool_start_y; y < pool_end_y; ++y)
290  {
291  for(int x = pool_start_x; x < pool_end_x; ++x)
292  {
293  const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
294  (src->info()->strides_in_bytes().z())) + x_off);
295  res = std::max(res, data);
296  }
297  }
298  }
299 
300  // Calculate square-root in case of l2 pooling
301  if(pool_info.pool_type == PoolingType::L2)
302  {
303  res = std::sqrt(res);
304  }
305 
306  // Store result
307  *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
308  }
309  },
310  in, out);
311  }
312 }

References calculate_avg_scale(), Window::DimX, Window::Dimension::end(), PoolingLayerInfo::exclude_padding, arm_compute::execute_window_loop(), Size2D::height, arm_compute::test::validation::idx_height, arm_compute::test::validation::idx_width, PoolingLayerInfo::is_global_pooling, arm_compute::L2, arm_compute::support::cpp11::lowest(), arm_compute::MAX, arm_compute::NHWC, PadStrideInfo::pad_bottom(), PadStrideInfo::pad_left(), PadStrideInfo::pad_right(), PoolingLayerInfo::pad_stride_info, PadStrideInfo::pad_top(), PoolingLayerInfo::pool_size, pool_stride_x, PoolingLayerInfo::pool_type, Iterator::ptr(), arm_compute::test::validation::scale, Window::set(), arm_compute::test::validation::src, Window::Dimension::start(), PadStrideInfo::stride(), Size2D::width, Window::x(), Window::y(), and Window::z().
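
Per output element, the pooling window is reduced with one of three rules: average pooling accumulates values, L2 pooling accumulates squares and takes a square root after scaling, and max pooling keeps a running maximum; the scale is the reciprocal of the number of window elements (computed by calculate_avg_scale, which can exclude padded positions). A scalar sketch of the three reductions, assuming the window samples have already been gathered; names are illustrative.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

enum class PoolType { AVG, L2, MAX };

float pool(const std::vector<float> &window_values, PoolType type, int window_elems)
{
    if(type == PoolType::MAX)
    {
        float res = std::numeric_limits<float>::lowest();
        for(float v : window_values)
        {
            res = std::max(res, v);
        }
        return res;
    }

    // AVG accumulates values, L2 accumulates squares.
    float res = 0.f;
    for(float v : window_values)
    {
        res += (type == PoolType::L2) ? v * v : v;
    }
    res /= float(window_elems); // scale = 1 / number of window elements

    // L2 pooling finishes with a square root.
    return (type == PoolType::L2) ? std::sqrt(res) : res;
}

int main()
{
    const std::vector<float> w{ 1.f, 2.f, 3.f, 4.f };
    std::printf("avg=%f l2=%f max=%f\n",
                pool(w, PoolType::AVG, 4), pool(w, PoolType::L2, 4), pool(w, PoolType::MAX, 4));
    return 0;
}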

◆ poolingMxN_q8_neon_nhwc()

void arm_compute::cpu::poolingMxN_q8_neon_nhwc ( const ITensor *  src,
ITensor *  dst0,
ITensor *  dst1,
PoolingLayerInfo &  pool_info,
const Window &  window_src,
const Window &  window 
)

Definition at line 182 of file quantized.h.

183 {
184  ARM_COMPUTE_UNUSED(dst1);
185 
186  const int window_start_x = window.x().start();
187  const int window_end_x = window.x().end();
188  const int window_step_x = 16;
189  const int window_half_step_x = window_step_x / 2;
190 
191  Window window_out = window;
192  window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
193 
194  Iterator in(src, window_src);
195  Iterator out(dst0, window_out);
196 
197  using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
198  using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
199  using q16_t = typename wrapper::traits::promote_t<T>;
200  using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
201  using q32_t = typename wrapper::traits::promote_t<q16_t>;
202  using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
203 
204  const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
205  const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
206  const int pool_pad_right = pool_info.pad_stride_info.pad_right();
207  const int pool_pad_top = pool_info.pad_stride_info.pad_top();
208  const int pool_pad_left = pool_info.pad_stride_info.pad_left();
209  const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
210 
211  int pool_stride_x = 0;
212  int pool_stride_y = 0;
213  std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
214  const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
215  const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
216 
217  const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
218  const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
219  const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
220 
221  const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
222  // "new_offset" doesn't have to consider the "half_scale_v" in its computation
223  // With a requantization performed in a single step there won't be uncertainties introduced
224  const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
225 
226  const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
227  const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
228  const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
229 
230  execute_window_loop(window_out, [&](const Coordinates & id)
231  {
232  const int idx_width = id.y() * pool_stride_x;
233  const int idx_height = id.z() * pool_stride_y;
234  const int pool_limit_y = pool_pad_top - idx_height;
235  const int pool_limit_x = pool_pad_left - idx_width;
236 
237  const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
238  const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
239  const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
240  const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
241 
242  int x_off = window_start_x;
243  for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
244  {
245  if(pool_info.pool_type != PoolingType::MAX)
246  {
247  q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
248  q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
249  q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
250  q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
251 
252  // Calculate scale
253  const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
254  pool_stride_y);
255 
256  // Perform pooling
257  for(int y = pool_start_y; y < pool_end_y; ++y)
258  {
259  for(int x = pool_start_x; x < pool_end_x; ++x)
260  {
261  const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
262  (src->info()->strides_in_bytes().z())) + x_off);
263 
264  const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
265  const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
266  vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
267  vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
268  vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
269  vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
270  }
271  }
272 
273  if(src_qinfo != dst_qinfo)
274  {
275  const float32x4x4_t vres =
276  {
277  {
278  vcvtq_f32_q32(vres1),
279  vcvtq_f32_q32(vres2),
280  vcvtq_f32_q32(vres3),
281  vcvtq_f32_q32(vres4),
282  }
283  };
284  const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
285  // Store result
286  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
287  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
288  }
289  else
290  {
291  const float32x4_t scale_v = vdupq_n_f32(scale);
292  // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
293  vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
294  vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
295  vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
296  vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
297 
298  const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
299  const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
300  // Store result
301  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
302  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
303  }
304  }
305  else
306  {
307  q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
308 
309  for(int y = pool_start_y; y < pool_end_y; ++y)
310  {
311  for(int x = pool_start_x; x < pool_end_x; ++x)
312  {
313  const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
314  (src->info()->strides_in_bytes().z())) + x_off);
315  vres = wrapper::vmax(vres, data);
316  }
317  }
318 
319  // Store result
320  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
321  requant_qinfo) :
322  vres);
323  }
324  }
325 
326  if(pool_info.pool_type == PoolingType::MAX)
327  {
328  for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
329  {
330  q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
331  for(int y = pool_start_y; y < pool_end_y; ++y)
332  {
333  for(int x = pool_start_x; x < pool_end_x; ++x)
334  {
335  const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
336  (src->info()->strides_in_bytes().z())) + x_off);
337  vres = wrapper::vmax(vres, data);
338  }
339  }
340 
341  // Store result
342  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
343  (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
344  }
345  }
346 
347  // Left-overs loop
348  for(; x_off < window_end_x; ++x_off)
349  {
350  if(pool_info.pool_type != PoolingType::MAX)
351  {
352  q32_t res = static_cast<q32_t>(0.f);
353 
354  // Calculate scale
355  const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
356  pool_stride_y);
357 
358  // Perform pooling
359  for(int y = pool_start_y; y < pool_end_y; ++y)
360  {
361  for(int x = pool_start_x; x < pool_end_x; ++x)
362  {
363  const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
364  (src->info()->strides_in_bytes().z())) + x_off);
365  res += data;
366  }
367  }
368 
369  if(src_qinfo != dst_qinfo)
370  {
371  const float res_f = static_cast<float>(res);
372  const float new_scale = quant_rescale / scale;
373  const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
374 
375  // Store result
376  *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
377  }
378  else
379  {
380  // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
381  res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
382 
383  // Store result
384  *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
385  }
386  }
387  else
388  {
389  T res = std::numeric_limits<T>::min();
390 
391  for(int y = pool_start_y; y < pool_end_y; ++y)
392  {
393  for(int x = pool_start_x; x < pool_end_x; ++x)
394  {
395  const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
396  (src->info()->strides_in_bytes().z())) + x_off);
397  res = std::max(res, data);
398  }
399  }
400 
401  // Store result
402  if(src_qinfo != dst_qinfo)
403  {
404  const float res_f = static_cast<float>(res);
405  *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
406  }
407  else
408  {
409  *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
410  }
411  }
412  }
413 
414  },
415  in, out);
416 }

References ARM_COMPUTE_UNUSED, calculate_avg_scale(), Window::DimX, Window::Dimension::end(), PoolingLayerInfo::exclude_padding, arm_compute::execute_window_loop(), Size2D::height, arm_compute::test::validation::idx_height, arm_compute::test::validation::idx_width, ITensor::info(), PoolingLayerInfo::is_global_pooling, arm_compute::MAX, arm_compute::NHWC, UniformQuantizationInfo::offset, PadStrideInfo::pad_bottom(), PadStrideInfo::pad_left(), PadStrideInfo::pad_right(), PoolingLayerInfo::pad_stride_info, PadStrideInfo::pad_top(), PoolingLayerInfo::pool_size, pool_stride_x, PoolingLayerInfo::pool_type, Iterator::ptr(), ITensorInfo::quantization_info(), UniformQuantizationInfo::scale, arm_compute::test::validation::scale, Window::set(), arm_compute::test::validation::src, Window::Dimension::start(), PadStrideInfo::stride(), QuantizationInfo::uniform(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vcombine(), vcvtq_f32_q32(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vgethigh(), arm_compute::wrapper::vgetlow(), arm_compute::wrapper::vload(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmax(), arm_compute::wrapper::vmla(), arm_compute::wrapper::vmovl(), arm_compute::wrapper::vmovn(), arm_compute::wrapper::vstore(), Size2D::width, Window::x(), Window::y(), and Window::z().
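
In the quantized average path, when source and destination use the same quantization the accumulated sum is scaled by the reciprocal of the window size and 0.5f is added before the float-to-integer conversion, so the result rounds to nearest instead of truncating towards zero (the accumulator is non-negative for QASYMM8 data). A minimal sketch of that rounding step; the values are illustrative.

#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t acc   = 10;        // sum of the 4 quantized samples in the window
    const float    scale = 1.f / 4.f; // reciprocal of the window element count

    // 10 * 0.25 = 2.5: adding 0.5f before the cast gives 3, whereas a plain cast would truncate to 2.
    const uint8_t avg = static_cast<uint8_t>(0.5f + acc * scale);
    std::printf("average = %u\n", static_cast<unsigned>(avg));
    return 0;
}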

◆ poolingMxN_qasymm8_neon_nhwc()

void poolingMxN_qasymm8_neon_nhwc ( const ITensor *  src0,
ITensor *  dst0,
ITensor *  dst1,
PoolingLayerInfo &  pool_info,
const Window &  window_src,
const Window &  window 
)

Definition at line 36 of file qasymm8.cpp.

37 {
38  poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window);
39 }

References arm_compute::test::validation::src.

◆ poolingMxN_qasymm8_signed_neon_nhwc()

void poolingMxN_qasymm8_signed_neon_nhwc ( const ITensor *  src0,
ITensor *  dst0,
ITensor *  dst1,
PoolingLayerInfo &  pool_info,
const Window &  window_src,
const Window &  window 
)

Definition at line 36 of file qasymm8_signed.cpp.

37 {
38  poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window);
39 }

References arm_compute::test::validation::src.

◆ qasymm8_neon_activation()

void qasymm8_neon_activation ( const ITensor *  src,
ITensor *  dst,
const ActivationLayerInfo &  act_info,
const Window &  window 
)

Definition at line 39 of file qasymm8.cpp.

40 {
41  constexpr int window_step_x = 16;
42  const auto window_start_x = static_cast<int>(window.x().start());
43  const auto window_end_x = static_cast<int>(window.x().end());
44  const ActivationLayerInfo::ActivationFunction act = act_info.activation();
45 
46  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
47  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
48 
49  Iterator input(src, win_collapsed);
50  Iterator output(dst, win_collapsed);
51 
52  const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
53  const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
54  const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in));
55  const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in));
56  const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in);
57  const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in);
58  const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
59  const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
60  const auto vconst_1 = vdupq_n_f32(1.f);
61 #ifndef __aarch64__
62  const auto vconst_0_f32 = vdupq_n_f32(0);
63 #endif // __aarch64__
64  const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
65  const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
66  const float a_f32 = act_info.a();
67  const float b_f32 = act_info.b();
68  const auto const_6_f32 = vdupq_n_f32(6.f);
69  const auto const_0_f32 = vdupq_n_f32(0.f);
70  const auto const_3_f32 = vdupq_n_f32(3.f);
71  const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
72 
73  // Initialise scale/offset for re-quantization
74  float s = qi_in.scale / qi_out.scale;
75  float o = -qi_in.offset * s + qi_out.offset;
76  float32x4_t vs = vdupq_n_f32(s);
77  float32x4_t vo = vdupq_n_f32(o);
78 
79  execute_window_loop(win_collapsed, [&](const Coordinates &)
80  {
81  const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
82  const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
83 
84  wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
85 
86  // Compute S elements per iteration
87  int x = window_start_x;
88  for(; x <= (window_end_x - window_step_x); x += window_step_x)
89  {
90  const auto vin = wrapper::vloadq(input_ptr + x);
91  if(act == ActivationLayerInfo::ActivationFunction::RELU)
92  {
93  // Perform activation
94  tmp = vmaxq_u8(vconst_0, vin);
95  // Re-quantize to new output space
96  tmp = vmlaq_qasymm8(tmp, vs, vo);
97  }
98  else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
99  {
100  // Perform activation
101  tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
102  // Re-quantize to new output space
103  tmp = vmlaq_qasymm8(tmp, vs, vo);
104  }
105  else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
106  {
107  // Perform activation
108  tmp = vminq_u8(va, vmaxq_u8(vb, vin));
109  // Re-quantize to new output space
110  tmp = vmlaq_qasymm8(tmp, vs, vo);
111  }
112  else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
113  {
114  // De-quantize
115  const auto vin_deq = vdequantize(vin, qi_in);
116  // Perform activation
117  const float32x4x4_t tmp_dep =
118  {
119  {
120  wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
121  wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
122  wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
123  wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
124  }
125  };
126  // Re-quantize to new output space
127  tmp = vquantize(tmp_dep, qi_out);
128  }
129  else if(act == ActivationLayerInfo::ActivationFunction::TANH)
130  {
131  // De-quantize
132  const auto vin_deq = vdequantize(vin, qi_in);
133  // Perform activation
134  const float32x4x4_t tmp_dep =
135  {
136  {
137  wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
138  wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
139  wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
140  wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
141  }
142  };
143  // Re-quantize to new output space
144  tmp = vquantize(tmp_dep, qi_out);
145  }
146  else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
147  {
148  // De-quantize
149  const auto vin_deq = vdequantize(vin, qi_in);
150  // Perform activation
151  const float32x4x4_t tmp_dep =
152  {
153  {
154  wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
155  wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
156  wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
157  wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
158  }
159  };
160  // Re-quantize to new output space
161  tmp = vquantize(tmp_dep, qi_out);
162  }
163  else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
164  {
165  const auto vin_deq = vdequantize(vin, qi_in);
166 
167 #ifdef __aarch64__
168  const uint32x4x4_t pos_mask =
169  {
170  {
171  wrapper::vcgtz(vin_deq.val[0]),
172  wrapper::vcgtz(vin_deq.val[1]),
173  wrapper::vcgtz(vin_deq.val[2]),
174  wrapper::vcgtz(vin_deq.val[3]),
175  }
176  };
177 #else // __aarch64__
178  const uint32x4x4_t pos_mask =
179  {
180  {
181  wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
182  wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
183  wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
184  wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
185  }
186  };
187 #endif // __aarch64__
188 
189  const float32x4x4_t tmp_dep =
190  {
191  {