Compute Library
 21.08
arm_compute::cpu Namespace Reference

Namespaces

 kernel
 
 kernels
 

Data Structures

struct  AsmGemmInfo
 
class  CpuActivation
 Basic function to run kernels::CpuActivationKernel. More...
 
class  CpuAdd
 Basic function to run kernels::CpuAddKernel; a usage sketch follows this list. More...
 
class  CpuAuxTensorHandler
 
struct  CpuCapabilities
 Structure that encodes the CPU capabilities to be used. More...
 
class  CpuCast
 Basic function to run kernels::CpuCastKernel. More...
 
class  CpuComplexMul
 Basic function to run kernels::CpuComplexMulKernel. More...
 
class  CpuConcatenate
 Basic function to concatenate tensors along a given axis. More...
 
class  CpuContext
 CPU context implementation class. More...
 
class  CpuConv2d
 Basic function to simulate a convolution layer. More...
 
class  CpuConvertFullyConnectedWeights
 Basic function to run kernels::CpuConvertFullyConnectedWeightsKernel. More...
 
class  CpuCopy
 Basic function to run kernels::CpuCopyKernel. More...
 
class  CpuDepthwiseConv2d
 Function to execute a depthwise convolution. More...
 
class  CpuDepthwiseConv2dAssemblyDispatch
 Depthwise convolution assembly kernel glue. More...
 
class  CpuDequantize
 Basic function to run kernels::CpuDequantizeKernel that dequantizes an input tensor. More...
 
class  CpuDirectConv2d
 Function to run the direct convolution. More...
 
class  CpuElementwiseArithmetic
 Class to run cpu::kernels::CpuArithmeticKernel for all arithmetic operations except division and power. More...
 
class  CpuElementwiseBase
 
class  CpuElementwiseComparison
 Basic function to run cpu::kernels::CpuComparisonKernel. More...
 
class  CpuElementwiseComparisonStatic
 Basic function to run cpu::kernels::CpuComparisonKernel. More...
 
class  CpuElementwiseDivision
 Basic function to run cpu::kernels::CpuArithmeticKernel for division. More...
 
class  CpuElementwisePower
 Basic function to run cpu::kernels::CpuArithmeticKernel for power. More...
 
class  CpuElementwiseUnary
 
class  CpuFill
 Basic function to run kernels::CpuFillKernel. More...
 
class  CpuFlatten
 Basic function to flatten a given input. More...
 
class  CpuFloor
 Basic function to run kernels::CpuFloorKernel. More...
 
class  CpuFullyConnected
 Basic function to compute a Fully Connected layer. More...
 
class  CpuGemm
 Basic function to execute GEMM. More...
 
class  CpuGemmAssemblyDispatch
 Assembly kernel glue. More...
 
class  CpuGemmConvolution
 Basic function to compute the convolution layer. More...
 
class  CpuGemmDirectConv2d
 
class  CpuGemmLowpMatrixMultiplyCore
 Basic function to execute GEMMLowpMatrixMultiplyCore. More...
 
class  CpuGemmLowpOutputStage
 Basic function to execute GEMMLowpQuantizeDown kernels. More...
 
class  CpuLogits1DSoftmaxKernel
 
class  CpuMul
 Basic function to run kernels::CpuMulKernel. More...
 
class  CpuPermute
 Basic function to run kernels::CpuPermuteKernel. More...
 
class  CpuPool2d
 Basic function to simulate a pooling layer with the specified pooling operation. More...
 
class  CpuQuantize
 Basic function to run kernels::CpuQuantizeKernel that quantizes an input tensor. More...
 
class  CpuQueue
 CPU queue implementation class. More...
 
class  CpuReshape
 Basic function to run kernels::CpuReshapeKernel. More...
 
class  CpuScale
 Basic function to compute Scale. More...
 
class  CpuSoftmaxGeneric
 Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. More...
 
class  CpuSub
 Basic function to run kernels::CpuSubKernel. More...
 
class  CpuTensor
 CPU tensor implementation class. More...
 
class  CpuTranspose
 Basic function to run kernels::CpuTransposeKernel. More...
 
class  CpuWinogradConv2d
 
class  CpuWinogradConv2dConfiguration
 Kernel to perform Winograd. More...
 
class  CpuWinogradConv2dTransformInputKernel
 Kernel to perform Winograd input transform. More...
 
class  CpuWinogradConv2dTransformOutputKernel
 Kernel to perform Winograd output transform. More...
 
class  CpuWinogradConv2dTransformWeightsKernel
 Kernel to perform Winograd weights transform. More...
 
class  ICpuWinogradConv2dTransformInputKernel
 Interface for the kernel to perform Winograd input transform. More...
 
class  ICpuWinogradConv2dTransformOutputKernel
 Interface for the kernel to perform Winograd output transform. More...
 
class  ICpuWinogradConv2dTransformWeightsKernel
 Interface for the kernel to perform Winograd weights transform. More...
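
The operator classes listed above are stateless: they are configured once on ITensorInfo descriptors and executed on an ITensorPack that binds the actual tensors at run time, following the experimental::INEOperator interface they inherit through ICpuOperator. Below is a minimal sketch of that workflow for CpuAdd; the include path and the exact configure() arguments are assumptions based on the 21.08 sources and may differ in other releases.

// Minimal usage sketch for cpu::CpuAdd (assumed workflow, not verbatim library code).
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/cpu/operators/CpuAdd.h" // internal header; path is an assumption

using namespace arm_compute;

void cpu_add_example()
{
    // Describe the operands: configuration works on metadata only.
    TensorInfo info(TensorShape(32U, 16U), 1, DataType::F32);

    Tensor a, b, dst;
    a.allocator()->init(info);
    b.allocator()->init(info);
    dst.allocator()->init(info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // Configure the stateless operator on the tensor descriptors.
    cpu::CpuAdd add;
    add.configure(&info, &info, &info, ConvertPolicy::SATURATE);

    // Bind the actual tensors at run time through a tensor pack.
    ITensorPack pack;
    pack.add_const_tensor(ACL_SRC_0, &a);
    pack.add_const_tensor(ACL_SRC_1, &b);
    pack.add_tensor(ACL_DST, &dst);
    add.run(pack);
}

Because configuration only touches metadata, a configured operator can be reused for every workload that matches those descriptors, with all tensor memory owned by the caller.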
 

Typedefs

using ICpuKernel = arm_compute::ICPPKernel
 
using ICpuOperator = experimental::INEOperator
 
using CpuElementwiseMax = CpuElementwiseArithmetic< ArithmeticOperation::MAX >
 Class to run cpu::kernels::CpuArithmeticKernel for the maximum operation. More...
 
using CpuElementwiseMin = CpuElementwiseArithmetic< ArithmeticOperation::MIN >
 Class to run cpu::kernels::CpuArithmeticKernel for the minimum operation. More...
 
using CpuElementwiseSquaredDiff = CpuElementwiseArithmetic< ArithmeticOperation::SQUARED_DIFF >
 Class to run cpu::kernels::CpuArithmeticKernel for the squared difference operation. More...
 
using NEEqual = CpuElementwiseComparisonStatic< ComparisonOperation::Equal >
 Basic function to run equal comparison. More...
 
using NENotEqual = CpuElementwiseComparisonStatic< ComparisonOperation::NotEqual >
 Basic function to run not equal comparison. More...
 
using NEGreater = CpuElementwiseComparisonStatic< ComparisonOperation::Greater >
 Basic function to run greater comparison. More...
 
using NEGreaterEqual = CpuElementwiseComparisonStatic< ComparisonOperation::GreaterEqual >
 Basic function to run greater-equal comparison. More...
 
using NELess = CpuElementwiseComparisonStatic< ComparisonOperation::Less >
 Basic function to run less comparison. More...
 
using NELessEqual = CpuElementwiseComparisonStatic< ComparisonOperation::LessEqual >
 Basic function to run less-equal comparison. More...
 
using KernelType = kernels::CpuElementwiseUnaryKernel
 
using CpuPRelu = CpuElementwiseArithmetic< ArithmeticOperation::PRELU >
 Class to run cpu::kernels::CpuArithmeticKernel for the PRelu operation. More...
 
using CpuSoftmax = CpuSoftmaxGeneric< false >
 
using CpuLogSoftmax = CpuSoftmaxGeneric< true >
 

Enumerations

enum  AsmConvMethod { Im2Col, Indirect, Conv }
 

Functions

void qasymm8_neon_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void qasymm8_sve_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void qasymm8_signed_neon_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void qasymm8_signed_sve_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void qsymm16_neon_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void qsymm16_sve_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void fp16_neon_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void fp16_sve_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void fp32_neon_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void fp32_sve_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void add_qasymm8_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_qasymm8_signed_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_qsymm16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template<typename ScalarType >
void add_same_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template<typename InputScalarType , typename OutputScalarType , typename InputVectorType >
void elementwise_op (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, OutputScalarType(*scalar_func)(const InputScalarType &, const InputScalarType &), int(*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), int(*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
 
template<ArithmeticOperation op, typename ScalarType >
ScalarType elementwise_arithm_op_scalar (const ScalarType &a, const ScalarType &b)
 
template<ArithmeticOperation op, typename VectorType >
VectorType::type elementwise_arithm_op (const typename VectorType::type &a, const typename VectorType::type &b)
 
template<>
int32x4_t elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< int32_t, 4 > > (const int32x4_t &a, const int32x4_t &b)
 
template<>
float32x4_t elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< float, 4 > > (const float32x4_t &a, const float32x4_t &b)
 
template<>
float32x4_t elementwise_arithm_op< ArithmeticOperation::POWER, typename wrapper::traits::neon_vector< float, 4 > > (const float32x4_t &a, const float32x4_t &b)
 
template<ArithmeticOperation op, typename ScalarType , typename VectorType >
VectorType::type elementwise_arithm_op_broadcast (const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
 
template<ArithmeticOperation op, typename ScalarType , typename VectorType >
int elementwise_arithm_op_loop (int window_start_x, int window_end_x, int window_step_x, const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
 
template<ArithmeticOperation op, typename ScalarType , typename VectorType >
int elementwise_arithm_op_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
 
template<ArithmeticOperation op, typename VectorType >
void elementwise_arithm_op (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op, typename InputScalarType >
uint8_t elementwise_comp_op_scalar (const InputScalarType &a, const InputScalarType &b)
 
template<ComparisonOperation op, typename InputVectorType , typename OutputVectorType >
OutputVectorType elementwise_comp_op (const InputVectorType &a, const InputVectorType &b)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType , typename OutputVectorType >
OutputVectorType elementwise_comp_op_broadcast (const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_broadcast_8_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_broadcast_16_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_broadcast_32_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_8_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_16_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_32_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
void elementwise_comp_op_8 (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
void elementwise_comp_op_16 (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
void elementwise_comp_op_32 (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
float32x4x4_t load_quantized (const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
 
float32x4x4_t load_quantized_signed (const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
 
void store_quantized (uint8_t *output_ptr, const uint32x4x4_t &out)
 
void store_quantized (uint8_t *output_ptr, const int32x4x4_t &out)
 
void store_quantized (uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
 
void store_quantized_signed (int8_t *output_ptr, const int32x4x4_t &out)
 
void store_quantized_signed (int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
 
template<ArithmeticOperation op>
uint8_t elementwise_arithm_op_quantized_scalar (const float &a, const float &b, UniformQuantizationInfo qinfo)
 
template<ArithmeticOperation op>
int8_t elementwise_arithm_op_quantized_signed_scalar (const float &a, const float &b, UniformQuantizationInfo qinfo)
 
template<ArithmeticOperation op>
float32x4x4_t elementwise_arithm_op (const float32x4x4_t &a, const float32x4x4_t &b)
 
template<ComparisonOperation op>
uint8_t elementwise_comp_op_quantized_scalar (const float &a, const float &b, UniformQuantizationInfo qinfo)
 
template<ComparisonOperation op>
uint32x4x4_t elementwise_comp_op (const float32x4x4_t &a, const float32x4x4_t &b)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_singed_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_signed_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_signed_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_signed_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
void elementwise_op_quantized (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, uint8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
 
void elementwise_comp_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, uint8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
 
void elementwise_op_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
 
template<ArithmeticOperation op>
void elementwise_arithm_op_quantized (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void elementwise_arithm_op_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void elementwise_comp_op_quantized (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void elementwise_comp_op_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<typename ScalarType >
ScalarType elementwise_op_scalar_imp (ElementWiseUnary op, const ScalarType &a)
 
template<typename ScalarType , typename VectorType >
VectorType elementwise_op_imp (ElementWiseUnary op, const VectorType &a)
 
template<typename ScalarType >
void elementwise_op (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
void fp16_neon_floor (const void *src, void *dst, int len)
 
void fp32_neon_floor (const void *src, void *dst, int len)
 
void poolingMxN_fp32_neon_nhwc (const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
 
void poolingMxN_qasymm8_neon_nhwc (const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
 
void poolingMxN_qasymm8_signed_neon_nhwc (const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
 
void poolingMxN_fp16_neon_nhwc (const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
 
template<typename T >
uint32_t offset_no_padding (uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout)
 
template<typename T >
std::enable_if< std::is_same< T, int8_t >::value, int8_t >::type quantize (float val, const UniformQuantizationInfo &info)
 
template<typename T >
std::enable_if< std::is_same< T, uint8_t >::value, uint8_t >::type quantize (float val, const UniformQuantizationInfo &info)
 
template<typename T >
T vcvtq_q32_f32 (float32x4_t values)
 
template<>
uint32x4_t vcvtq_q32_f32 (float32x4_t values)
 
template<>
int32x4_t vcvtq_q32_f32 (float32x4_t values)
 
template<typename T >
float32x4_t vcvtq_f32_q32 (T values)
 
template<>
float32x4_t vcvtq_f32_q32 (uint32x4_t values)
 
template<>
float32x4_t vcvtq_f32_q32 (int32x4_t values)
 
template<typename Tout >
Tout vrequantize_pooling_with_scale (const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
 
template<>
uint8x16_t vrequantize_pooling_with_scale (const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
 
template<>
int8x16_t vrequantize_pooling_with_scale (const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
 
template<typename Tin , typename Tout >
Tout vrequantize_pooling (Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo)
 
template<>
uint8x16_t vrequantize_pooling (uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
 
template<>
int8x16_t vrequantize_pooling (int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
 
template<typename T >
T vrequantize_pooling (T &vec, const UniformQuantizationInfo &requant_qinfo)
 
template<>
uint8x8_t vrequantize_pooling (uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
 
template<>
int8x8_t vrequantize_pooling (int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
 
float calculate_avg_scale (bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
 
template<typename T >
void poolingMxN_q8_neon_nhwc (const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
 
void u8_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void s16_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_signed_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
template<typename T >
void nearest_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset, bool align_corners, const Window &window)
 
template<typename T >
void bilinear_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
template<typename T >
void common_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void fp16_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void fp32_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void s16_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void u8_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_signed_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
template<typename T >
void neon_logits_1d_max (const ITensor *in, ITensor *out, const Window &window)
 
template<typename T >
void neon_softmax_logits_1d_quantized (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
 
template<typename T >
void neon_softmax_logits_1d_float (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void sub_qasymm8_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void sub_qasymm8_signed_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void sub_qsymm16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template<typename T >
void sub_same_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void fp16_neon_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 
void fp16_sve_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 
void fp32_neon_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 
void fp32_sve_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 

Variables

constexpr int step = 4
 

Typedef Documentation

◆ CpuElementwiseMax

Class to run cpu::kernels::CpuArithmeticKernel for the maximum operation.

Definition at line 65 of file CpuElementwise.h.

◆ CpuElementwiseMin

Class to run cpu::kernels::CpuArithmeticKernel for the minimum operation.

Definition at line 67 of file CpuElementwise.h.

◆ CpuElementwiseSquaredDiff

Class to run cpu::kernels::CpuArithmeticKernel for the squared difference operation.

Definition at line 69 of file CpuElementwise.h.

◆ CpuLogSoftmax

Definition at line 107 of file CpuSoftmax.h.

◆ CpuPRelu

Class to run cpu::kernels::CpuArithmeticKernel for the PRelu operation.

Definition at line 34 of file CpuPRelu.h.

◆ CpuSoftmax

using CpuSoftmax = CpuSoftmaxGeneric<false>

Definition at line 106 of file CpuSoftmax.h.

◆ ICpuKernel

Definition at line 33 of file ICpuKernel.h.

◆ ICpuOperator

Definition at line 33 of file ICpuOperator.h.

◆ KernelType

◆ NEEqual

Basic function to run equal comparison.

Definition at line 171 of file CpuElementwise.h.

◆ NEGreater

Basic function to run greater comparison.

Definition at line 175 of file CpuElementwise.h.

◆ NEGreaterEqual

Basic function to run greater-equal comparison.

Definition at line 177 of file CpuElementwise.h.

◆ NELess

Basic function to run less comparison.

Definition at line 179 of file CpuElementwise.h.

◆ NELessEqual

Basic function to run less-equal comparison.

Definition at line 181 of file CpuElementwise.h.

◆ NENotEqual

Basic function to run not equal comparison.

Definition at line 173 of file CpuElementwise.h.

Enumeration Type Documentation

◆ AsmConvMethod

Function Documentation

◆ add_qasymm8_neon()

void add_qasymm8_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 35 of file qasymm8.cpp.

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::quantize_qasymm8(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().

36 {
37  ARM_COMPUTE_UNUSED(policy);
38 
39  // Create input windows
40  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
42 
43  // Clear X Dimension on execution window as we handle manually
44  Window win = window;
45  win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  const int window_step_x = 16;
48  const auto window_start_x = static_cast<int>(window.x().start());
49  const auto window_end_x = static_cast<int>(window.x().end());
50  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
51 
52  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
53  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
54  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
55 
56  const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
57  const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
58 
59  if(is_broadcast_across_x)
60  {
61  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
62  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
63  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
64  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
65  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
66  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
67  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
68 
69  const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
70  const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
71  const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
72  const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
73 
74  // Clear X Dimension on execution window as we handle manually
75  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
76 
77  Iterator broadcast_input(broadcast_tensor, broadcast_win);
78  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
79  Iterator output(dst, win);
80 
81  execute_window_loop(win, [&](const Coordinates &)
82  {
83  const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
84  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
85 
86  const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
87  const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
88 
89  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
90  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
91  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);
92  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);
93 
94  const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
95 
96  // Compute S elements per iteration
97  int x = window_start_x;
98  for(; x <= (window_end_x - window_step_x); x += window_step_x)
99  {
100  const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
101  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
102  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
103  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
104  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
105 
106  int32x4_t rf_0{};
107  int32x4_t rf_1{};
108  int32x4_t rf_2{};
109  int32x4_t rf_3{};
110 
111 #ifdef __aarch64__
112  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
113  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
114  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
115  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
116 #else //__aarch64__
117  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
118  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
119  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
120  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
121 #endif //__aarch64__
122 
123  const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
124  const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
125  vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
126  }
127 
128  // Compute left-over elements
129  for(; x < window_end_x; ++x)
130  {
131  const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
132  *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
133  }
134  },
135  broadcast_input, non_broadcast_input, output);
136  }
137  else
138  {
139  // Clear X Dimension on execution window as we handle manually
140  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
141  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
142 
143  Iterator input1(src0, input1_win);
144  Iterator input2(src1, input2_win);
145  Iterator output(dst, win);
146 
147  const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
148  const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
149  const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
150  const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
151 
152  execute_window_loop(win, [&](const Coordinates &)
153  {
154  const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
155  const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
156  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
157 
158  // Compute S elements per iteration
159  int x = window_start_x;
160  for(; x <= (window_end_x - window_step_x); x += window_step_x)
161  {
162  const uint8x16_t a = vld1q_u8(input1_ptr + x);
163  const uint8x16_t b = vld1q_u8(input2_ptr + x);
164 
165  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
166  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
167  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
168  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
169 
170  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
171  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
172  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);
173  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);
174 
175  int32x4_t rf_0{};
176  int32x4_t rf_1{};
177  int32x4_t rf_2{};
178  int32x4_t rf_3{};
179 
180 #ifdef __aarch64__
181  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
182  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
183  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
184  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
185 #else //__aarch64__
186  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
187  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
188  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
189  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
190 #endif //__aarch64__
191 
192  const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
193  const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
194  vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
195  }
196 
197  // Compute left-over elements
198  for(; x < window_end_x; ++x)
199  {
200  const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
201  const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
202  *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
203  }
204  },
205  input1, input2, output);
206  }
207 }
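
The kernel follows a dequantize-add-requantize scheme: each input lane is offset-corrected and scaled to float, the sum is divided by the output scale and shifted by the output offset, then saturated back to 8 bits (round-to-nearest via vcvtnq_s32_f32 on AArch64, truncation via vcvtq_s32_f32 otherwise). The scalar reference below mirrors the leftover-elements loop; it is a sketch only, using standard C++ rounding and clamping in place of quantize_qasymm8().

#include <algorithm>
#include <cmath>
#include <cstdint>

// One QASYMM8 output element: dequantize both inputs, add in float, requantize.
uint8_t add_qasymm8_scalar(uint8_t a, uint8_t b,
                           float scale_a, int32_t offset_a,  // input 1 quantization
                           float scale_b, int32_t offset_b,  // input 2 quantization
                           float scale_o, int32_t offset_o)  // output quantization
{
    const float af  = (static_cast<int32_t>(a) - offset_a) * scale_a;
    const float bf  = (static_cast<int32_t>(b) - offset_b) * scale_b;
    const int32_t q = static_cast<int32_t>(std::lround((af + bf) / scale_o)) + offset_o;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

The vector path performs the same computation on 16 elements per iteration.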

◆ add_qasymm8_signed_neon()

void add_qasymm8_signed_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 35 of file qasymm8_signed.cpp.

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::quantize_qasymm8_signed(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().

36 {
37  ARM_COMPUTE_UNUSED(policy);
38 
39  // Create input windows
40  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
42 
43  // Clear X Dimension on execution window as we handle manually
44  Window win = window;
45  win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  const int window_step_x = 16;
48  const auto window_start_x = static_cast<int>(window.x().start());
49  const auto window_end_x = static_cast<int>(window.x().end());
50  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
51 
52  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
53  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
54  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
55 
56  const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
57  const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
58 
59  if(is_broadcast_across_x)
60  {
61  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
62  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
63  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
64  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
65  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
66  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
67  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
68 
69  const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
70  const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
71  const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
72  const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
73 
74  // Clear X Dimension on execution window as we handle manually
75  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
76 
77  Iterator broadcast_input(broadcast_tensor, broadcast_win);
78  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
79  Iterator output(dst, win);
80 
81  execute_window_loop(win, [&](const Coordinates &)
82  {
83  const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
84  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
85 
86  const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
87  const int8x16_t broadcast_value_vec = vdupq_n_s8(broadcast_value);
88 
89  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2);
90  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2);
91  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2);
92  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2);
93  const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
94 
95  // Compute S elements per iteration
96  int x = window_start_x;
97  for(; x <= (window_end_x - window_step_x); x += window_step_x)
98  {
99  const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
100 
101  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
102  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
103  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
104  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
105 
106  int32x4_t rf_0{};
107  int32x4_t rf_1{};
108  int32x4_t rf_2{};
109  int32x4_t rf_3{};
110 
111 #ifdef __aarch64__
112  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
113  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
114  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
115  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
116 #else //__aarch64__
117  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
118  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
119  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
120  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
121 #endif //__aarch64__
122 
123  const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
124  const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
125  vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
126  }
127 
128  // Compute left-over elements
129  for(; x < window_end_x; ++x)
130  {
131  const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
132  *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info);
133  }
134  },
135  broadcast_input, non_broadcast_input, output);
136  }
137  else
138  {
139  // Clear X Dimension on execution window as we handle manually
140  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
141  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
142 
143  Iterator input1(src0, input1_win);
144  Iterator input2(src1, input2_win);
145  Iterator output(dst, win);
146 
147  const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
148  const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
149  const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
150  const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
151  execute_window_loop(win, [&](const Coordinates &)
152  {
153  const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
154  const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
155  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
156 
157  // Compute S elements per iteration
158  int x = window_start_x;
159  for(; x <= (window_end_x - window_step_x); x += window_step_x)
160  {
161  const int8x16_t a = vld1q_s8(input1_ptr + x);
162  const int8x16_t b = vld1q_s8(input2_ptr + x);
163 
164  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
165  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
166  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
167  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
168 
169  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2);
170  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2);
171  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2);
172  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2);
173 
174  int32x4_t rf_0{};
175  int32x4_t rf_1{};
176  int32x4_t rf_2{};
177  int32x4_t rf_3{};
178 
179 #ifdef __aarch64__
180  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
181  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
182  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
183  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
184 #else //__aarch64__
185  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
186  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
187  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
188  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
189 #endif //__aarch64__
190 
191  const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
192  const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
193  vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
194  }
195 
196  // Compute left-over elements
197  for(; x < window_end_x; ++x)
198  {
199  const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
200  const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
201  *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), dst->info()->quantization_info());
202  }
203  },
204  input1, input2, output);
205  }
206 }

◆ add_qsymm16_neon()

void add_qsymm16_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 35 of file qsymm16.cpp.

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::quantize_qsymm16(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().

36 {
37  ARM_COMPUTE_UNUSED(policy);
38 
39  // Create input windows
40  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
42 
43  // Clear X Dimension on execution window as we handle manually
44  Window win = window;
45  win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  const int window_step_x = 8;
48  const auto window_start_x = static_cast<int>(window.x().start());
49  const auto window_end_x = static_cast<int>(window.x().end());
50  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
51 
52  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
53  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
54  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
55 
56  const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
57  const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
58  const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
59 
60  if(is_broadcast_across_x)
61  {
62  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
63  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
64  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
65  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
66  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
67  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
68  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
69 
70  // Clear X Dimension on execution window as we handle manually
71  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
72 
73  Iterator broadcast_input(broadcast_tensor, broadcast_win);
74  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
75  Iterator output(dst, win);
76 
77  execute_window_loop(win, [&](const Coordinates &)
78  {
79  const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
80  const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
81 
82  const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
83  const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
84 
85  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
86  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
87  const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
88 
89  // Compute S elements per iteration
90  int x = window_start_x;
91  for(; x <= (window_end_x - window_step_x); x += window_step_x)
92  {
93  const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
94  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
95  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
96 
97  int32x4_t rf_0{};
98  int32x4_t rf_1{};
99 #ifdef __aarch64__
100  rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
101  rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
102 #else //__aarch64__
103  rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
104  rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
105 #endif //__aarch64__
106 
107  const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
108  vst1q_s16(output_ptr + x, pa);
109  }
110 
111  // Compute left-over elements
112  for(; x < window_end_x; ++x)
113  {
114  const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
115  *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
116  }
117  },
118  broadcast_input, non_broadcast_input, output);
119  }
120  else
121  {
122  // Clear X Dimension on execution window as we handle manually
123  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
124  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
125 
126  Iterator input1(src0, input1_win);
127  Iterator input2(src1, input2_win);
128  Iterator output(dst, win);
129 
130  execute_window_loop(win, [&](const Coordinates &)
131  {
132  const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
133  const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
134  const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
135 
136  // Compute S elements per iteration
137  int x = window_start_x;
138  for(; x <= (window_end_x - window_step_x); x += window_step_x)
139  {
140  const int16x8_t a = vld1q_s16(input1_ptr + x);
141  const int16x8_t b = vld1q_s16(input2_ptr + x);
142 
143  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
144  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
145  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
146  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
147 
148  int32x4_t rf_0{};
149  int32x4_t rf_1{};
150 #ifdef __aarch64__
151  rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
152  rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
153 #else //__aarch64__
154  rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
155  rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
156 #endif //__aarch64__
157 
158  const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
159  vst1q_s16(output_ptr + x, pa);
160  }
161 
162  // Compute left-over elements
163  for(; x < window_end_x; ++x)
164  {
165  const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
166  const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
167  *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info());
168  }
169  },
170  input1, input2, output);
171  }
172 }
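
The left-over loop above is the scalar reference for the vectorised path: each QSYMM16 input is dequantized with its own scale and the float sum is requantized with the output scale. A minimal sketch of that per-element arithmetic, with illustrative scales and values rather than library data:

#include "arm_compute/core/QuantizationInfo.h"

using namespace arm_compute;

const UniformQuantizationInfo iq1{ 0.02f, 0 }; // scale of src0 (illustrative)
const UniformQuantizationInfo iq2{ 0.01f, 0 }; // scale of src1 (illustrative)
const UniformQuantizationInfo oq{ 0.05f, 0 };  // scale of dst  (illustrative)

const int16_t a   = 100;                                  // represents 100 * 0.02f = 2.0f
const int16_t b   = 300;                                  // represents 300 * 0.01f = 3.0f
const float   afs = static_cast<int32_t>(a) * iq1.scale;  // dequantize first input
const float   bfs = static_cast<int32_t>(b) * iq2.scale;  // dequantize second input

// Same call as the left-over loop: requantize the float sum with the output scale.
const int16_t sum = quantize_qsymm16(afs + bfs, oq);      // 5.0f / 0.05f = 100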

◆ add_same_neon()

void arm_compute::cpu::add_same_neon ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

SIMD vector tag type.

Definition at line 45 of file list.h.

References arm_compute::wrapper::add_sat(), Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), arm_compute::SATURATE, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vqadd(), arm_compute::wrapper::vstore(), Dimensions< T >::x(), and Window::x().

46 {
47  /** SIMD vector tag type. */
48  using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
49 
50  // Create input windows
51  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
52  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
53 
54  // Clear X Dimension on execution window as we handle manually
55  Window win = window;
56  win.set(Window::DimX, Window::Dimension(0, 1, 1));
57 
58  constexpr int window_step_x = 16 / sizeof(ScalarType);
59  const auto window_start_x = static_cast<int>(window.x().start());
60  const auto window_end_x = static_cast<int>(window.x().end());
61  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
62 
63  if(is_broadcast_across_x)
64  {
65  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
66  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
67  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
68  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
69  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
70 
71  // Clear X Dimension on execution window as we handle manually
72  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
73 
74  Iterator broadcast_input(broadcast_tensor, broadcast_win);
75  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
76  Iterator output(dst, win);
77 
78  execute_window_loop(win, [&](const Coordinates &)
79  {
80  const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
81  const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
82 
83  const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
84  const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
85 
86  // Compute S elements per iteration
87  int x = window_start_x;
88  for(; x <= (window_end_x - window_step_x); x += window_step_x)
89  {
90  const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
91  const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
92  wrapper::vstore(output_ptr + x, res);
93  }
94 
95  // Compute left-over elements
96  for(; x < window_end_x; ++x)
97  {
98  const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
99  *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
100  }
101  },
102  broadcast_input, non_broadcast_input, output);
103  }
104  else
105  {
106  // Clear X Dimension on execution window as we handle manually
107  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
108  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
109 
110  Iterator input1(src0, input1_win);
111  Iterator input2(src1, input2_win);
112  Iterator output(dst, win);
113 
114  execute_window_loop(win, [&](const Coordinates &)
115  {
116  const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
117  const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
118  const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
119 
120  // Compute S elements per iteration
121  int x = window_start_x;
122  for(; x <= (window_end_x - window_step_x); x += window_step_x)
123  {
124  const auto val1 = wrapper::vloadq(input1_ptr + x);
125  const auto val2 = wrapper::vloadq(input2_ptr + x);
126  const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
127  wrapper::vstore(output_ptr + x, res);
128  }
129 
130  // Compute left-over elements
131  for(; x < window_end_x; ++x)
132  {
133  const auto val1 = *(input1_ptr + x);
134  const auto val2 = *(input2_ptr + x);
135  *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
136  }
137  },
138  input1, input2, output);
139  }
140 }
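
A minimal usage sketch for the float specialisation. These kernels live in the internal src/ tree rather than in the public API, so the direct call and the availability of the declaring header are assumptions made only for illustration:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void add_f32_example() // hypothetical helper, not part of the library
{
    Tensor src0, src1, dst;
    src0.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
    src1.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
    src0.allocator()->allocate();
    src1.allocator()->allocate();
    dst.allocator()->allocate();

    // Execution window covering the whole output shape.
    Window win;
    win.use_tensor_dimensions(dst.info()->tensor_shape());

    // Saturating float addition over the window (ScalarType = float).
    cpu::add_same_neon<float>(&src0, &src1, &dst, ConvertPolicy::SATURATE, win);
}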

◆ bilinear_neon_scale()

void arm_compute::cpu::bilinear_neon_scale ( const ITensor src,
ITensor dst,
const ITensor offsets,
const ITensor dx,
const ITensor dy,
BorderMode  border_mode,
PixelValue  constant_border_value,
float  sampling_offset,
bool  align_corners,
const Window window 
)

Definition at line 94 of file list.h.

References ARM_COMPUTE_ERROR, BorderSize::bottom, arm_compute::scale_utils::calculate_resize_ratio(), arm_compute::CONSTANT, arm_compute::scale_helpers::delta_bilinear(), ITensorInfo::dimension(), Window::DimY, Window::DimZ, arm_compute::execute_window_loop(), PixelValue::get(), ITensor::info(), BorderSize::left, offset(), ITensorInfo::padding(), Iterator::ptr(), ITensor::ptr_to_element(), arm_compute::REPLICATE, BorderSize::right, BorderSize::top, and type.

97 {
98  // Compute the ratio between source height and destination height
99  const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
100 
101  Iterator out(dst, window);
102  const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
103  const int in_dim_w = src->info()->dimension(1);
104  const int in_dim_h = src->info()->dimension(2);
105  const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
106 
107  // Don't increment in Y and Z direction for the input tensor
108  // A pointer to the start of this plane is needed as base for the precomputed offsets
109  Window win_in(window);
110  win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
111  win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
112  Iterator in(src, win_in);
113 
114  if(border_mode == BorderMode::CONSTANT)
115  {
116 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
117  using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
118 #else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
119  using ConstType = T;
120 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
121  const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
122  execute_window_loop(window, [&](const Coordinates & id)
123  {
124  const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
125  const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
126  const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
127  const int32_t in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
128  const T *in_ptr = reinterpret_cast<const T *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
129 
130  const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
131  const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
132  const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
133  const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
134 
135  *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
136  },
137  in, out);
138  }
139  else if(border_mode == BorderMode::REPLICATE)
140  {
141  execute_window_loop(window, [&](const Coordinates & id)
142  {
143  const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
144  const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
145  const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
146  const int in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
147 
148  auto clamped_w = utility::clamp<int>(offset, 0, in_dim_w - 1);
149  auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
150  auto clamped_h = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
151  auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
152 
153  const auto a00 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
154  const auto a01 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
155  const auto a10 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
156  const auto a11 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
157 
158  *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
159  },
160  in, out);
161  }
162  else
163  {
164  ARM_COMPUTE_ERROR("Not implemented");
165  }
166 }
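
For reference, the per-pixel interpolation delegated to scale_helpers::delta_bilinear combines the four neighbours with the usual bilinear weights; a standalone sketch of that arithmetic (not the library implementation):

// dx and dy are the fractional offsets of the sampling point inside the 2x2 neighbourhood.
float bilinear_reference(float a00, float a01, float a10, float a11, float dx, float dy)
{
    const float w00 = (1.f - dx) * (1.f - dy); // top-left
    const float w01 = dx * (1.f - dy);         // top-right
    const float w10 = (1.f - dx) * dy;         // bottom-left
    const float w11 = dx * dy;                 // bottom-right
    return a00 * w00 + a01 * w01 + a10 * w10 + a11 * w11;
}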

◆ calculate_avg_scale()

float arm_compute::cpu::calculate_avg_scale ( bool  exclude_padding,
DataLayout  data_layout,
const Coordinates id,
const int  pool_size_x,
const int  pool_size_y,
const int  upper_bound_w,
const int  upper_bound_h,
const int  pad_x,
const int  pad_y,
const int  stride_x,
const int  stride_y 
)
inline

Definition at line 162 of file quantized.h.

References arm_compute::get_data_layout_dimension_index(), arm_compute::HEIGHT, arm_compute::test::validation::idx_height, arm_compute::test::validation::idx_width, and arm_compute::WIDTH.

Referenced by poolingMxN_fp32_neon_nhwc(), and poolingMxN_q8_neon_nhwc().

164 {
165  const unsigned int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
166  const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
167 
168  int start_x = id[idx_width] * stride_x - pad_x;
169  int start_y = id[idx_height] * stride_y - pad_y;
170 
171  const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
172  const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
173  if(exclude_padding)
174  {
175  start_x = std::max(0, start_x);
176  start_y = std::max(0, start_y);
177  }
178  return 1.f / ((end_y - start_y) * (end_x - start_x));
179 }
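
A worked call, assuming the usual arm_compute headers are in scope; the numbers are illustrative only (3x3 pooling, stride 2, pad 1, NHWC):

const Coordinates id(0, 0, 0); // (C, W, H) coordinates of the output element, here the top-left one

const float scale = calculate_avg_scale(/* exclude_padding */ true, DataLayout::NHWC, id,
                                        /* pool_size_x */ 3, /* pool_size_y */ 3,
                                        /* upper_bound_w */ 7, /* upper_bound_h */ 7,
                                        /* pad_x */ 1, /* pad_y */ 1,
                                        /* stride_x */ 2, /* stride_y */ 2);
// start_x = 0 * 2 - 1 = -1, end_x = min(-1 + 3, 7) = 2; exclude_padding clamps start_x to 0,
// so 2 elements contribute in x. The same holds in y, giving scale = 1 / (2 * 2) = 0.25f.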

◆ common_neon_scale()

void arm_compute::cpu::common_neon_scale ( const ITensor src,
ITensor dst,
const ITensor offsets,
const ITensor dx,
const ITensor dy,
InterpolationPolicy  policy,
BorderMode  border_mode,
PixelValue  constant_border_value,
float  sampling_offset,
bool  align_corners,
const Window window 
)

Definition at line 169 of file list.h.

References arm_compute::BILINEAR, arm_compute::test::validation::dst, arm_compute::NEAREST_NEIGHBOR, and arm_compute::test::validation::src.

172 {
173  if(policy == InterpolationPolicy::BILINEAR)
174  {
175  bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
176  }
177  else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
178  {
179  nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window);
180  }
181 }

◆ elementwise_arithm_op() [1/3]

float32x4x4_t arm_compute::cpu::elementwise_arithm_op ( const float32x4x4_t &  a,
const float32x4x4_t &  b 
)
inline

Definition at line 125 of file elementwise_quantized_list.h.

126 {
127  using neon_vector_float = wrapper::traits::neon_vector<float, 4>;
128  float32x4x4_t out =
129  {
130  {
131  elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
132  elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
133  elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
134  elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
135  }
136  };
137  return out;
138 }

◆ elementwise_arithm_op() [2/3]

VectorType::type arm_compute::cpu::elementwise_arithm_op ( const typename VectorType::type a,
const typename VectorType::type b 
)
inline

Definition at line 160 of file elementwise_list.h.

References ARM_COMPUTE_ERROR, arm_compute::MAX, arm_compute::MIN, arm_compute::PRELU, arm_compute::SQUARED_DIFF, type, arm_compute::wrapper::vbsl(), arm_compute::wrapper::vcgt(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vmax(), arm_compute::wrapper::vmin(), arm_compute::wrapper::vmul(), and arm_compute::wrapper::vsub().

161 {
162  using vec_type = typename VectorType::type;
163  using scalar_type = typename VectorType::scalar_type;
164  using tag_type = typename VectorType::tag_type;
165 
166  vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
167 
168  switch(op)
169  {
170  case ArithmeticOperation::MAX:
171  res = wrapper::vmax(a, b);
172  break;
173  case ArithmeticOperation::MIN:
174  res = wrapper::vmin(a, b);
175  break;
176  case ArithmeticOperation::SQUARED_DIFF:
177  {
178  const vec_type tmp = wrapper::vsub(a, b);
179  res = wrapper::vmul(tmp, tmp);
180  break;
181  }
182  case ArithmeticOperation::PRELU:
183  {
184  const vec_type zero = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
185  const vec_type tmp = wrapper::vmul(a, b);
186  const auto gt = wrapper::vcgt(a, zero);
187 
188  res = wrapper::vbsl(gt, a, tmp);
189  break;
190  }
191 
192  default:
193  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
194  }
195 
196  return res;
197 }
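
An illustrative instantiation of the vector overload for MAX on 128-bit float vectors, assuming <arm_neon.h> and the internal wrapper traits are in scope:

using float_vec = wrapper::traits::neon_vector<float, 4>; // VectorType::type == float32x4_t

const float32x4_t va = vdupq_n_f32(1.0f);
const float32x4_t vb = vdupq_n_f32(2.0f);
const float32x4_t vr = elementwise_arithm_op<ArithmeticOperation::MAX, float_vec>(va, vb);
// every lane of vr is 2.0f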

◆ elementwise_arithm_op() [3/3]

void arm_compute::cpu::elementwise_arithm_op ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window 
)

Definition at line 269 of file elementwise_list.h.

270 {
271  using scalar_type = typename VectorType::scalar_type;
272 
273  elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
274  &elementwise_arithm_op_scalar<op, scalar_type>,
275  &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
276  &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
277 }
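
The whole-tensor overload is typically instantiated per operation and element type; a hedged sketch, where min_f32 is a hypothetical wrapper and the tensors are assumed to be pre-configured F32 ITensors:

void min_f32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &win)
{
    using float_vec = wrapper::traits::neon_vector<float, 4>;
    elementwise_arithm_op<ArithmeticOperation::MIN, float_vec>(in1, in2, out, win);
}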

◆ elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< float, 4 > >()

float32x4_t arm_compute::cpu::elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< float, 4 > > ( const float32x4_t &  a,
const float32x4_t &  b 
)
inline

Definition at line 206 of file elementwise_list.h.

References arm_compute::test::validation::b, and arm_compute::wrapper::vdiv().

207 {
208  return wrapper::vdiv(a, b);
209 }

◆ elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< int32_t, 4 > >()

int32x4_t arm_compute::cpu::elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< int32_t, 4 > > ( const int32x4_t &  a,
const int32x4_t &  b 
)
inline

Definition at line 200 of file elementwise_list.h.

References arm_compute::test::validation::b, arm_compute::wrapper::vdiv(), and arm_compute::vfloorq_f32().

201 {
202  return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
203 }

◆ elementwise_arithm_op< ArithmeticOperation::POWER, typename wrapper::traits::neon_vector< float, 4 > >()

float32x4_t arm_compute::cpu::elementwise_arithm_op< ArithmeticOperation::POWER, typename wrapper::traits::neon_vector< float, 4 > > ( const float32x4_t &  a,
const float32x4_t &  b 
)
inline

Definition at line 212 of file elementwise_list.h.

References arm_compute::test::validation::b, arm_compute::wrapper::vdiv(), and arm_compute::wrapper::vpow().

213 {
214  return wrapper::vpow(a, b);
215 }

◆ elementwise_arithm_op_broadcast()

VectorType::type arm_compute::cpu::elementwise_arithm_op_broadcast ( const typename VectorType::type a,
const ScalarType &  broadcast_value,
const bool  reorder 
)
inline

Definition at line 232 of file elementwise_list.h.

References type, and arm_compute::wrapper::vdup_n().

233 {
234  using tag_type = typename VectorType::tag_type;
235  using vec_type = typename VectorType::type;
236 
237  vec_type broadcast_vector = wrapper::vdup_n(broadcast_value, tag_type{});
238  return elementwise_arithm_op<op, VectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
239 }

◆ elementwise_arithm_op_broadcast_loop()

int arm_compute::cpu::elementwise_arithm_op_broadcast_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const ScalarType *  non_broadcast_input_ptr,
const ScalarType &  broadcast_value,
ScalarType *  output_ptr,
const bool  reorder 
)
inline

Definition at line 256 of file elementwise_list.h.

References arm_compute::wrapper::vloadq(), and arm_compute::wrapper::vstore().

258 {
259  int x = window_start_x;
260  for(; x <= (window_end_x - window_step_x); x += window_step_x)
261  {
262  const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
263  wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
264  }
265  return x;
266 }

◆ elementwise_arithm_op_loop()

int arm_compute::cpu::elementwise_arithm_op_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const ScalarType *  input1_ptr,
const ScalarType *  input2_ptr,
ScalarType *  output_ptr 
)
inline

Definition at line 242 of file elementwise_list.h.

References arm_compute::test::validation::b, arm_compute::wrapper::vloadq(), and arm_compute::wrapper::vstore().

244 {
245  int x = window_start_x;
246  for(; x <= (window_end_x - window_step_x); x += window_step_x)
247  {
248  const auto a = wrapper::vloadq(input1_ptr + x);
249  const auto b = wrapper::vloadq(input2_ptr + x);
250  wrapper::vstore(output_ptr + x, elementwise_arithm_op<op, VectorType>(a, b));
251  }
252  return x;
253 }

◆ elementwise_arithm_op_quantized()

void arm_compute::cpu::elementwise_arithm_op_quantized ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window 
)

Definition at line 622 of file elementwise_quantized_list.h.

References elementwise_op_quantized().

623 {
624  elementwise_op_quantized(in1, in2, out, window, &elementwise_arithm_op_quantized_scalar<op>,
625  &elementwise_arithm_op_quantized_broadcast_loop<op>,
626  &elementwise_arithm_op_quantized_loop<op>);
627 }

◆ elementwise_arithm_op_quantized_broadcast_loop()

int arm_compute::cpu::elementwise_arithm_op_quantized_broadcast_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const uint8_t *  non_broadcast_input_ptr,
float32x4x4_t  broadcast_vector,
uint8_t *  output_ptr,
int32x4_t  voffset_non_broadcast,
float32x4_t  vscale_non_broadcast,
float32x4_t  voffseto,
float32x4_t  invvscaleo,
bool  reorder 
)
inline

Definition at line 199 of file elementwise_quantized_list.h.

References load_quantized(), and store_quantized().

203 {
204  int x = window_start_x;
205  for(; x <= (window_end_x - window_step_x); x += window_step_x)
206  {
207  const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
208  const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
209  store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
210  }
211  return x;
212 }

◆ elementwise_arithm_op_quantized_loop()

int arm_compute::cpu::elementwise_arithm_op_quantized_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const uint8_t *  input1_ptr,
const uint8_t *  input2_ptr,
uint8_t *  output_ptr,
int32x4_t  voffset1,
int32x4_t  voffset2,
float32x4_t  vscale1,
float32x4_t  vscale2,
float32x4_t  voffseto,
float32x4_t  invvscaleo 
)
inline

Definition at line 163 of file elementwise_quantized_list.h.

References load_quantized(), and store_quantized().

167 {
168  int x = window_start_x;
169  for(; x <= (window_end_x - window_step_x); x += window_step_x)
170  {
171  // Get inputs and compute output
172  const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
173  const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
174  const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
175  store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
176  }
177  return x;
178 }

◆ elementwise_arithm_op_quantized_scalar()

uint8_t arm_compute::cpu::elementwise_arithm_op_quantized_scalar ( const float &  a,
const float &  b,
UniformQuantizationInfo  qinfo 
)
inline

Definition at line 113 of file elementwise_quantized_list.h.

References arm_compute::quantize_qasymm8().

114 {
115  return quantize_qasymm8(elementwise_arithm_op_scalar<op>(a, b), qinfo);
116 }
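
Illustrative use of the scalar helper: the float operands are assumed to be already dequantized by elementwise_op_quantized, and only the result is requantized with the output quantization info (example values, not library data):

const UniformQuantizationInfo oq{ 0.1f, 10 }; // illustrative output scale and offset

const uint8_t r = elementwise_arithm_op_quantized_scalar<ArithmeticOperation::MAX>(1.5f, 2.0f, oq);
// max(1.5f, 2.0f) = 2.0f -> round(2.0f / 0.1f) + 10 = 30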

◆ elementwise_arithm_op_quantized_signed()

void arm_compute::cpu::elementwise_arithm_op_quantized_signed ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window 
)

Definition at line 629 of file elementwise_quantized_list.h.

References elementwise_op_quantized_signed().

630 {
631  elementwise_op_quantized_signed(in1, in2, out, window, &elementwise_arithm_op_quantized_signed_scalar<op>,
632  &elementwise_arithm_op_quantized_signed_broadcast_loop<op>,
633  &elementwise_arithm_op_quantized_singed_loop<op>);
634 }

◆ elementwise_arithm_op_quantized_signed_broadcast_loop()

int arm_compute::cpu::elementwise_arithm_op_quantized_signed_broadcast_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const int8_t *  non_broadcast_input_ptr,
float32x4x4_t  broadcast_vector,
int8_t *  output_ptr,
int32x4_t  voffset_non_broadcast,
float32x4_t  vscale_non_broadcast,
float32x4_t  voffseto,
float32x4_t  invvscaleo,
bool  reorder 
)
inline

Definition at line 214 of file elementwise_quantized_list.h.

References load_quantized_signed(), and store_quantized_signed().

218 {
219  int x = window_start_x;
220  for(; x <= (window_end_x - window_step_x); x += window_step_x)
221  {
222  const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
223  const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
224  store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
225  }
226  return x;
227 }

◆ elementwise_arithm_op_quantized_signed_scalar()

int8_t arm_compute::cpu::elementwise_arithm_op_quantized_signed_scalar ( const float &  a,
const float &  b,
UniformQuantizationInfo  qinfo 
)
inline

Definition at line 119 of file elementwise_quantized_list.h.

References arm_compute::quantize_qasymm8_signed().

120 {
121  return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo);
122 }

◆ elementwise_arithm_op_quantized_singed_loop()

int arm_compute::cpu::elementwise_arithm_op_quantized_singed_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const int8_t *  input1_ptr,
const int8_t *  input2_ptr,
int8_t *  output_ptr,
int32x4_t  voffset1,
int32x4_t  voffset2,
float32x4_t  vscale1,
float32x4_t  vscale2,
float32x4_t  voffseto,
float32x4_t  invvscaleo 
)
inline

Definition at line 181 of file elementwise_quantized_list.h.

References load_quantized_signed(), and store_quantized_signed().

185 {
186  int x = window_start_x;
187  for(; x <= (window_end_x - window_step_x); x += window_step_x)
188  {
189  // Get inputs and compute output
190  const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
191  const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
192  const float32x4x4_t rf = elementwise_arithm_op<op>(af, bf);
193  store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
194  }
195  return x;
196 }

◆ elementwise_arithm_op_scalar()

ScalarType arm_compute::cpu::elementwise_arithm_op_scalar ( const ScalarType &  a,
const ScalarType &  b 
)
inline

Definition at line 113 of file elementwise_list.h.

References ARM_COMPUTE_ERROR, arm_compute::test::validation::b, arm_compute::DIV, arm_compute::MAX, arm_compute::MIN, arm_compute::POWER, arm_compute::PRELU, and arm_compute::SQUARED_DIFF.

114 {
115  auto res = ScalarType(0);
116 
117  switch(op)
118  {
119  case ArithmeticOperation::MAX:
120  res = std::max(a, b);
121  break;
122  case ArithmeticOperation::MIN:
123  res = std::min(a, b);
124  break;
125  case ArithmeticOperation::SQUARED_DIFF:
126  {
127  res = (a - b) * (a - b);
128  break;
129  }
130  case ArithmeticOperation::PRELU:
131  {
132  res = (a > 0 ? a : a * b);
133  break;
134  }
135  case ArithmeticOperation::DIV:
136  {
137  res = a / b;
138  if(std::is_integral<ScalarType>::value)
139  {
140  res = (b == 0) ? 0 : res;
141  if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
142  {
143  --res;
144  }
145  }
146  break;
147  }
148  case ArithmeticOperation::POWER:
149  {
150  res = std::pow(a, b);
151  break;
152  }
153  default:
154  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
155  }
156  return res;
157 }
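
A few worked scalar results that follow directly from the switch above (values chosen for illustration):

const float s = elementwise_arithm_op_scalar<ArithmeticOperation::SQUARED_DIFF>(3.0f, 5.0f);
// (3 - 5) * (3 - 5) = 4.0f

const float p = elementwise_arithm_op_scalar<ArithmeticOperation::PRELU>(-2.0f, 0.5f);
// a <= 0, so the result is a * b = -1.0f

const int d = elementwise_arithm_op_scalar<ArithmeticOperation::DIV>(-7, 2);
// integer DIV floors towards negative infinity: -7 / 2 truncates to -3, the remainder is
// non-zero and the signs differ, so the result is decremented to -4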

◆ elementwise_comp_op() [1/2]

uint32x4x4_t arm_compute::cpu::elementwise_comp_op ( const float32x4x4_t &  a,
const float32x4x4_t &  b 
)
inline

Definition at line 148 of file elementwise_quantized_list.h.

149 {
150  uint32x4x4_t out =
151  {
152  {
153  elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
154  elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
155  elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
156  elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])
157  }
158  };
159  return out;
160 }

◆ elementwise_comp_op() [2/2]

OutputVectorType arm_compute::cpu::elementwise_comp_op ( const InputVectorType &  a,
const InputVectorType &  b 
)
inline

Definition at line 311 of file elementwise_list.h.

References ARM_COMPUTE_ERROR, arm_compute::Equal, arm_compute::Greater, arm_compute::GreaterEqual, arm_compute::Less, arm_compute::LessEqual, arm_compute::NotEqual, arm_compute::wrapper::vceq(), arm_compute::wrapper::vcge(), arm_compute::wrapper::vcgt(), and arm_compute::wrapper::vnot().

312 {
313  OutputVectorType res = { 0, 0, 0, 0 };
314 
315  switch(op)
316  {
317  case ComparisonOperation::Equal:
318  res = wrapper::vceq(a, b);
319  break;
320  case ComparisonOperation::NotEqual:
321  res = wrapper::vnot(wrapper::vceq(a, b));
322  break;
323  case ComparisonOperation::Greater:
324  res = wrapper::vcgt(a, b);
325  break;
326  case ComparisonOperation::GreaterEqual:
327  res = wrapper::vcge(a, b);
328  break;
329  case ComparisonOperation::Less:
330  res = wrapper::vcgt(b, a);
331  break;
332  case ComparisonOperation::LessEqual:
333  res = wrapper::vcge(b, a);
334  break;
335  default:
336  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
337  }
338 
339  return res;
340 }
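
Less and LessEqual are expressed by swapping the operands of the greater-than intrinsics; an illustrative instantiation for float vectors, assuming <arm_neon.h> is included:

const float32x4_t va = vdupq_n_f32(1.0f);
const float32x4_t vb = vdupq_n_f32(2.0f);

const uint32x4_t mask = elementwise_comp_op<ComparisonOperation::Less, float32x4_t, uint32x4_t>(va, vb);
// 1.0f < 2.0f, so every lane of mask is 0xFFFFFFFF (internally computed as vcgt(vb, va))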

◆ elementwise_comp_op_16()

void arm_compute::cpu::elementwise_comp_op_16 ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window 
)

Definition at line 467 of file elementwise_list.h.

468 {
469  elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
470  &elementwise_comp_op_scalar<op, InputScalarType>,
471  &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
472  &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
473 }

◆ elementwise_comp_op_16_loop()

int arm_compute::cpu::elementwise_comp_op_16_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  input1_ptr,
const InputScalarType *  input2_ptr,
uint8_t *  output_ptr 
)
inline

Definition at line 414 of file elementwise_list.h.

References arm_compute::test::validation::b, arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmovn(), and arm_compute::wrapper::vstore().

416 {
417  int x = window_start_x;
418  for(; x <= (window_end_x - window_step_x); x += window_step_x)
419  {
420  const auto a = wrapper::vloadq(input1_ptr + x);
421  const auto b = wrapper::vloadq(input2_ptr + x);
422  const auto res = elementwise_comp_op<op, InputVectorType, uint16x8_t>(a, b);
423  wrapper::vstore(output_ptr + x, wrapper::vmovn(res));
424  }
425  return x;
426 }

◆ elementwise_comp_op_32()

void arm_compute::cpu::elementwise_comp_op_32 ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window 
)

Definition at line 476 of file elementwise_list.h.

477 {
478  elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
479  &elementwise_comp_op_scalar<op, InputScalarType>,
480  &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
481  &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
482 }

◆ elementwise_comp_op_32_loop()

int arm_compute::cpu::elementwise_comp_op_32_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  input1_ptr,
const InputScalarType *  input2_ptr,
uint8_t *  output_ptr 
)
inline

Definition at line 429 of file elementwise_list.h.

References arm_compute::test::validation::b, arm_compute::wrapper::vcombine(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmovn(), and arm_compute::wrapper::vstore().

431 {
432  int x = window_start_x;
433  for(; x <= (window_end_x - window_step_x); x += window_step_x)
434  {
435  auto a = wrapper::vloadq(input1_ptr + x);
436  auto b = wrapper::vloadq(input2_ptr + x);
437  const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
438  a = wrapper::vloadq(input1_ptr + x + 4);
439  b = wrapper::vloadq(input2_ptr + x + 4);
440  const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
441  wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
442  }
443  if(x <= window_end_x - 4)
444  {
445  const auto a = wrapper::vloadq(input1_ptr + x);
446  const auto b = wrapper::vloadq(input2_ptr + x);
447  const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
448  for(int i = 0; i < 4; i++)
449  {
450  *(output_ptr + x + i) = wrapper::vgetlane(res, i);
451  }
452  x = +4;
453  }
454  return x;
455 }

◆ elementwise_comp_op_8()

void arm_compute::cpu::elementwise_comp_op_8 ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window 
)

Definition at line 458 of file elementwise_list.h.

459 {
460  elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
461  &elementwise_comp_op_scalar<op, InputScalarType>,
462  &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
463  &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
464 }

◆ elementwise_comp_op_8_loop()

int arm_compute::cpu::elementwise_comp_op_8_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  input1_ptr,
const InputScalarType *  input2_ptr,
uint8_t *  output_ptr 
)
inline

Definition at line 399 of file elementwise_list.h.

References arm_compute::test::validation::b, arm_compute::wrapper::vloadq(), and arm_compute::wrapper::vstore().

401 {
402  int x = window_start_x;
403  for(; x <= (window_end_x - window_step_x); x += window_step_x)
404  {
405  const auto a = wrapper::vloadq(input1_ptr + x);
406  const auto b = wrapper::vloadq(input2_ptr + x);
407  const auto res = elementwise_comp_op<op, InputVectorType, uint8x16_t>(a, b);
408  wrapper::vstore(output_ptr + x, res);
409  }
410  return x;
411 }

◆ elementwise_comp_op_broadcast()

OutputVectorType arm_compute::cpu::elementwise_comp_op_broadcast ( const InputVectorType &  a,
const InputScalarType &  broadcast_value,
const bool  reorder 
)
inline

Definition at line 343 of file elementwise_list.h.

References arm_compute::wrapper::vdup_n().

344 {
345  InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
346  return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
347 }

◆ elementwise_comp_op_broadcast_16_loop()

int arm_compute::cpu::elementwise_comp_op_broadcast_16_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  non_broadcast_input_ptr,
const InputScalarType &  broadcast_value,
uint8_t *  output_ptr,
const bool  reorder 
)
inline

Definition at line 363 of file elementwise_list.h.

References arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmovn(), and arm_compute::wrapper::vstore().

365 {
366  int x = window_start_x;
367  for(; x <= (window_end_x - window_step_x); x += window_step_x)
368  {
369  const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
370  wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
371  }
372  return x;
373 }

◆ elementwise_comp_op_broadcast_32_loop()

int arm_compute::cpu::elementwise_comp_op_broadcast_32_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  non_broadcast_input_ptr,
const InputScalarType &  broadcast_value,
uint8_t *  output_ptr,
const bool  reorder 
)
inline

Definition at line 376 of file elementwise_list.h.

References arm_compute::test::validation::b, arm_compute::wrapper::vcombine(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmovn(), and arm_compute::wrapper::vstore().

378 {
379  int x = window_start_x;
380  for(; x <= (window_end_x - window_step_x); x += window_step_x)
381  {
382  const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
383  const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
384  wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
385  }
386  if(x <= window_end_x - 4)
387  {
388  const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
389  for(int i = 0; i < 4; i++)
390  {
391  *(output_ptr + x + i) = wrapper::vgetlane(a, i);
392  }
393  x = +4;
394  }
395  return x;
396 }

◆ elementwise_comp_op_broadcast_8_loop()

int arm_compute::cpu::elementwise_comp_op_broadcast_8_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const InputScalarType *  non_broadcast_input_ptr,
const InputScalarType &  broadcast_value,
uint8_t *  output_ptr,
const bool  reorder 
)
inline

Definition at line 350 of file elementwise_list.h.

References arm_compute::wrapper::vloadq(), and arm_compute::wrapper::vstore().

352 {
353  int x = window_start_x;
354  for(; x <= (window_end_x - window_step_x); x += window_step_x)
355  {
356  const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
357  wrapper::vstore(output_ptr + x, a);
358  }
359  return x;
360 }

◆ elementwise_comp_op_quantized()

void arm_compute::cpu::elementwise_comp_op_quantized ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window 
)

Definition at line 637 of file elementwise_quantized_list.h.

References elementwise_op_quantized().

638 {
639  elementwise_op_quantized(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
640  &elementwise_comp_op_quantized_broadcast_loop<op>,
641  &elementwise_comp_op_quantized_loop<op>);
642 }

◆ elementwise_comp_op_quantized_broadcast_loop()

int arm_compute::cpu::elementwise_comp_op_quantized_broadcast_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const uint8_t *  non_broadcast_input_ptr,
float32x4x4_t  broadcast_vector,
uint8_t *  output_ptr,
int32x4_t  voffset_non_broadcast,
float32x4_t  vscale_non_broadcast,
float32x4_t  voffseto,
float32x4_t  invvscaleo,
bool  reorder 
)
inline

Definition at line 266 of file elementwise_quantized_list.h.

References ARM_COMPUTE_UNUSED, load_quantized(), and store_quantized().

270 {
271  ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
272  int x = window_start_x;
273  for(; x <= (window_end_x - window_step_x); x += window_step_x)
274  {
275  const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
276  const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
277  store_quantized(output_ptr + x, rf);
278  }
279  return x;
280 }

◆ elementwise_comp_op_quantized_loop()

int arm_compute::cpu::elementwise_comp_op_quantized_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const uint8_t *  input1_ptr,
const uint8_t *  input2_ptr,
uint8_t *  output_ptr,
int32x4_t  voffset1,
int32x4_t  voffset2,
float32x4_t  vscale1,
float32x4_t  vscale2,
float32x4_t  voffseto,
float32x4_t  invvscaleo 
)
inline

Definition at line 230 of file elementwise_quantized_list.h.

References ARM_COMPUTE_UNUSED, load_quantized(), and store_quantized().

234 {
235  ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
236  int x = window_start_x;
237  for(; x <= (window_end_x - window_step_x); x += window_step_x)
238  {
239  const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
240  const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
241  const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
242  store_quantized(output_ptr + x, rf);
243  }
244  return x;
245 }

◆ elementwise_comp_op_quantized_scalar()

uint8_t arm_compute::cpu::elementwise_comp_op_quantized_scalar ( const float &  a,
const float &  b,
UniformQuantizationInfo  qinfo 
)
inline

Definition at line 141 of file elementwise_quantized_list.h.

References ARM_COMPUTE_UNUSED, and arm_compute::test::validation::b.

142 {
143  ARM_COMPUTE_UNUSED(qinfo);
144  return elementwise_comp_op_scalar<op>(a, b);
145 }

◆ elementwise_comp_op_quantized_signed()

void arm_compute::cpu::elementwise_comp_op_quantized_signed ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window 
)

Definition at line 645 of file elementwise_quantized_list.h.

References elementwise_comp_quantized_signed().

646 {
647  elementwise_comp_quantized_signed(in1, in2, out, window, &elementwise_comp_op_quantized_scalar<op>,
648  &elementwise_comp_op_quantized_signed_broadcast_loop<op>,
649  &elementwise_comp_op_quantized_signed_loop<op>);
650 }

◆ elementwise_comp_op_quantized_signed_broadcast_loop()

int arm_compute::cpu::elementwise_comp_op_quantized_signed_broadcast_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const int8_t *  non_broadcast_input_ptr,
float32x4x4_t  broadcast_vector,
uint8_t *  output_ptr,
int32x4_t  voffset_non_broadcast,
float32x4_t  vscale_non_broadcast,
float32x4_t  voffseto,
float32x4_t  invvscaleo,
bool  reorder 
)
inline

Definition at line 283 of file elementwise_quantized_list.h.

References ARM_COMPUTE_UNUSED, load_quantized_signed(), and store_quantized().

287 {
288  ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
289  int x = window_start_x;
290  for(; x <= (window_end_x - window_step_x); x += window_step_x)
291  {
292  const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
293  const uint32x4x4_t rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
294  store_quantized(output_ptr + x, rf);
295  }
296  return x;
297 }

◆ elementwise_comp_op_quantized_signed_loop()

int arm_compute::cpu::elementwise_comp_op_quantized_signed_loop ( int  window_start_x,
int  window_end_x,
int  window_step_x,
const int8_t *  input1_ptr,
const int8_t *  input2_ptr,
uint8_t *  output_ptr,
int32x4_t  voffset1,
int32x4_t  voffset2,
float32x4_t  vscale1,
float32x4_t  vscale2,
float32x4_t  voffseto,
float32x4_t  invvscaleo 
)
inline

Definition at line 248 of file elementwise_quantized_list.h.

References ARM_COMPUTE_UNUSED, load_quantized_signed(), and store_quantized().

252 {
253  ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
254  int x = window_start_x;
255  for(; x <= (window_end_x - window_step_x); x += window_step_x)
256  {
257  const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
258  const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
259  const uint32x4x4_t rf = elementwise_comp_op<op>(af, bf);
260  store_quantized(output_ptr + x, rf);
261  }
262  return x;
263 }

◆ elementwise_comp_op_scalar()

uint8_t arm_compute::cpu::elementwise_comp_op_scalar ( const InputScalarType &  a,
const InputScalarType &  b 
)
inline

Definition at line 280 of file elementwise_list.h.

References ARM_COMPUTE_ERROR, arm_compute::test::validation::b, arm_compute::Equal, arm_compute::Greater, arm_compute::GreaterEqual, arm_compute::Less, arm_compute::LessEqual, and arm_compute::NotEqual.

281 {
282  bool res = false;
283 
284  switch(op)
285  {
286  case ComparisonOperation::Equal:
287  res = (a == b);
288  break;
289  case ComparisonOperation::NotEqual:
290  res = (a != b);
291  break;
292  case ComparisonOperation::Greater:
293  res = (a > b);
294  break;
295  case ComparisonOperation::GreaterEqual:
296  res = (a >= b);
297  break;
298  case ComparisonOperation::Less:
299  res = (a < b);
300  break;
301  case ComparisonOperation::LessEqual:
302  res = (a <= b);
303  break;
304  default:
305  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
306  }
307  return res ? ~static_cast<uint8_t>(0) : static_cast<uint8_t>(0);
308 }
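
The scalar comparison returns an all-ones byte for true and zero for false, matching the vector masks produced above; for example:

const uint8_t t = elementwise_comp_op_scalar<ComparisonOperation::Greater>(3.0f, 1.0f); // 255
const uint8_t f = elementwise_comp_op_scalar<ComparisonOperation::Equal>(3.0f, 1.0f);   // 0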

◆ elementwise_comp_quantized_signed()

void arm_compute::cpu::elementwise_comp_quantized_signed ( const ITensor in1,
const ITensor in2,
ITensor out,
const Window window,
uint8_t(*)(const float &, const float &, UniformQuantizationInfo scalar_func,
int(*)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool)  broadcast_func,
int(*)(int, int, int, const int8_t *, const int8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)  neon_func 
)

Definition at line 407 of file elementwise_quantized_list.h.

References arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), arm_compute::dequantize_qasymm8_signed(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), arm_compute::vdequantize(), Dimensions< T >::x(), and Window::x().

Referenced by elementwise_comp_op_quantized_signed().

414 {
415  // Create input windows
416  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
417  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
418 
419  // Clear X Dimension on execution window as we handle manually
420  Window win = window;
421  win.set(Window::DimX, Window::Dimension(0, 1, 1));
422 
423  const int window_step_x = 16;
424  const auto window_start_x = static_cast<int>(window.x().start());
425  const auto window_end_x = static_cast<int>(window.x().end());
426  const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
427 
428  const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
429 
430  const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
431  const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
432 
433  if(is_broadcast_across_x)
434  {
435  // Select the broadcast input on the X axis
436  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
437  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
438  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
439  const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
440  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
441 
442  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
443  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
444 
445  const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
446  const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
447 
448  // Clear X Dimension on execution window as we handle manually
449  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
450 
451  Iterator broadcast_input(broadcast_tensor, broadcast_win);
452  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
453  Iterator output(out, win);
454 
455  execute_window_loop(win, [&](const Coordinates &)
456  {
457  const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
458  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
459 
460  const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
461  const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
462 
463  int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
464  voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
465  for(; x < window_end_x; ++x)
466  {
467  const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
468  const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
469  *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
470  }
471  },
472  broadcast_input, non_broadcast_input, output);
473  }
474  else
475  {
476  const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
477  const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
478 
479  // Input1 quantization info
480  const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
481  const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
482 
483  // Input2 quantization info
484  const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
485  const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
486 
487  // Clear X Dimension on execution window as we handle manually
488  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
489  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
490 
491  Iterator input1(in1, input1_win);
492  Iterator input2(in2, input2_win);
493  Iterator output(out, win);
494 
495  execute_window_loop(win, [&](const Coordinates &)
496  {
497  const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
498  const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
499  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
500 
501  int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
502  vscale1, vscale2, voffseto, invvscaleo);
503  for(; x < window_end_x; ++x)
504  {
505  const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
506  const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
507  *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
508  }
509  },
510  input1, input2, output);
511  }
512 }
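Note: in both branches the scalar tail dequantizes each operand with its own UniformQuantizationInfo before handing the float values to scalar_func. A standalone sketch of that per-element step, assuming the usual (value - offset) * scale asymmetric dequantization; QInfo and compare_greater_q8_signed are illustrative names, not library symbols:

#include <cstdint>

struct QInfo { float scale; int32_t offset; };   // stand-in for UniformQuantizationInfo

// Dequantize two QASYMM8_SIGNED values and compare them, producing the
// 0xFF / 0x00 mask written by the comparison kernels.
inline uint8_t compare_greater_q8_signed(int8_t a, int8_t b, QInfo qa, QInfo qb)
{
    const float af = (static_cast<int32_t>(a) - qa.offset) * qa.scale;
    const float bf = (static_cast<int32_t>(b) - qb.offset) * qb.scale;
    return (af > bf) ? 0xFF : 0x00;
}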

◆ elementwise_op() [1/2]

void arm_compute::cpu::elementwise_op ( const ITensor *  in1,
const ITensor *  in2,
ITensor *  out,
const Window &  window,
OutputScalarType(*)(const InputScalarType &, const InputScalarType &)  scalar_func,
int(*)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool)  broadcast_func,
int(*)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *)  neon_func 
)

Definition at line 36 of file elementwise_list.h.

References arm_compute::test::validation::b, Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), Dimensions< T >::x(), and Window::x().

40 {
41  // Create input windows
42  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
43  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
44 
45  // Clear X Dimension on execution window as we handle manually
46  Window win = window;
47  win.set(Window::DimX, Window::Dimension(0, 1, 1));
48 
49  const int window_step_x = std::min(16 / static_cast<int>(sizeof(OutputScalarType)), 8);
50  const auto window_start_x = static_cast<int>(window.x().start());
51  const auto window_end_x = static_cast<int>(window.x().end());
52  const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
53 
54  if(is_broadcast_across_x)
55  {
56  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
57  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
58  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
59  const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
60  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
61 
62  // Clear X Dimension on execution window as we handle manually
63  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
64 
65  Iterator broadcast_input(broadcast_tensor, broadcast_win);
66  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
67  Iterator output(out, win);
68 
69  execute_window_loop(win, [&](const Coordinates &)
70  {
71  auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
72  const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
73  const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
74 
75  int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
76  for(; x < window_end_x; ++x)
77  {
78  const auto a = *(non_broadcast_input_ptr + x);
79  *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
80  }
81  },
82  broadcast_input, non_broadcast_input, output);
83  }
84  else
85  {
86  // Clear X Dimension on execution window as we handle manually
87  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
88  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
89 
90  Iterator input1(in1, input1_win);
91  Iterator input2(in2, input2_win);
92  Iterator output(out, win);
93 
94  execute_window_loop(win, [&](const Coordinates &)
95  {
96  auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
97  const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
98  const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
99 
100  int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
101  for(; x < window_end_x; ++x)
102  {
103  const auto a = *(input1_ptr + x);
104  const auto b = *(input2_ptr + x);
105  *(output_ptr + x) = (*scalar_func)(a, b);
106  }
107  },
108  input1, input2, output);
109  }
110 }
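Note: the broadcast_func and neon_func callbacks process whole vectors of window_step_x elements and return the index of the first element they did not handle; the loop that follows finishes the tail with scalar_func. A hedged sketch of that contract using placeholder names (run_row is not a library function):

// Sketch of the "vector body returns the next index, scalar tail finishes the row" pattern.
template <typename T, typename VectorBody, typename ScalarOp>
void run_row(const T *in1, const T *in2, T *out, int start, int end, int step,
             VectorBody vector_body, ScalarOp scalar_op)
{
    int x = vector_body(start, end, step, in1, in2, out); // handles [start, x) in chunks of `step`
    for(; x < end; ++x)                                    // scalar tail for the remainder
    {
        out[x] = scalar_op(in1[x], in2[x]);
    }
}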

◆ elementwise_op() [2/2]

void arm_compute::cpu::elementwise_op ( const ITensor *  in,
ITensor *  out,
const Window &  window,
ElementWiseUnary  op 
)

Definition at line 83 of file elementwise_unary_list.h.

References Window::DimX, elementwise_op_scalar_imp(), Window::Dimension::end(), arm_compute::execute_window_loop(), arm_compute::test::validation::input, Iterator::ptr(), Window::set(), Window::Dimension::start(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vstore(), and Window::x().

84 {
85  const int window_step_x = 16 / sizeof(ScalarType);
86  const auto window_start_x = static_cast<int>(window.x().start());
87  const auto window_end_x = static_cast<int>(window.x().end());
88 
89  Window win = window;
90  win.set(Window::DimX, Window::Dimension(0, 1, 1));
91 
92  Iterator input(in, win);
93  Iterator output(out, win);
94 
95  execute_window_loop(win, [&](const Coordinates &)
96  {
97  auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
98  const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
99 
100  int x = window_start_x;
101  for(; x <= window_end_x - window_step_x; x += window_step_x)
102  {
103  wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
104  }
105  for(; x < window_end_x; ++x)
106  {
107  *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
108  }
109  },
110  input, output);
111 }
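Note: window_step_x is derived from the 128-bit NEON register width, i.e. 16 / sizeof(ScalarType) elements per iteration. A small compile-time illustration of the resulting lane counts (assumption: 128-bit vectors, as used throughout these kernels):

#include <cstdint>

static_assert(16 / sizeof(float) == 4, "4 fp32 lanes per 128-bit vector");
static_assert(16 / sizeof(int8_t) == 16, "16 int8 lanes per 128-bit vector");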

◆ elementwise_op_imp()

VectorType arm_compute::cpu::elementwise_op_imp ( ElementWiseUnary  op,
const VectorType &  a 
)
inline

Definition at line 59 of file elementwise_unary_list.h.

References arm_compute::ABS, ARM_COMPUTE_ERROR, arm_compute::EXP, arm_compute::LOG, arm_compute::NEG, arm_compute::ROUND, arm_compute::RSQRT, arm_compute::SIN, arm_compute::wrapper::vabs(), arm_compute::wrapper::vexpq(), arm_compute::wrapper::vinvsqrt(), arm_compute::wrapper::vlog(), arm_compute::wrapper::vneg(), arm_compute::wrapper::vround(), and arm_compute::wrapper::vsin().

60 {
61  switch(op)
62  {
63  case ElementWiseUnary::RSQRT:
64  return wrapper::vinvsqrt(a);
65  case ElementWiseUnary::EXP:
66  return wrapper::vexpq(a);
67  case ElementWiseUnary::NEG:
68  return wrapper::vneg(a);
69  case ElementWiseUnary::LOG:
70  return wrapper::vlog(a);
71  case ElementWiseUnary::ABS:
72  return wrapper::vabs(a);
73  case ElementWiseUnary::ROUND:
74  return wrapper::vround(a);
75  case ElementWiseUnary::SIN:
76  return wrapper::vsin(a);
77  default:
78  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
79  }
80 }

◆ elementwise_op_quantized()

void arm_compute::cpu::elementwise_op_quantized ( const ITensor *  in1,
const ITensor *  in2,
ITensor *  out,
const Window &  window,
uint8_t(*)(const float &, const float &, UniformQuantizationInfo)  scalar_func,
int(*)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool)  broadcast_func,
int(*)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)  neon_func 
)

Definition at line 299 of file elementwise_quantized_list.h.

References arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), arm_compute::dequantize_qasymm8(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), arm_compute::vdequantize(), Dimensions< T >::x(), and Window::x().

Referenced by elementwise_arithm_op_quantized(), and elementwise_comp_op_quantized().

306 {
307  // Create input windows
308  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
309  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
310 
311  // Clear X Dimension on execution window as we handle manually
312  Window win = window;
313  win.set(Window::DimX, Window::Dimension(0, 1, 1));
314 
315  const int window_step_x = 16;
316  const auto window_start_x = static_cast<int>(window.x().start());
317  const auto window_end_x = static_cast<int>(window.x().end());
318  const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
319 
320  const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
321 
322  // Output quantization info (add 0.5 to round toward the nearest integer - 0.5 rounds away from zero)
323  const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset + 0.5f);
324  const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
325 
326  if(is_broadcast_across_x)
327  {
328  // Select the broadcast input on the X axis
329  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
330  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
331  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
332  const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
333  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
334 
335  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
336  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
337 
338  const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
339  const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
340 
341  // Clear X Dimension on execution window as we handle manually
342  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
343 
344  Iterator broadcast_input(broadcast_tensor, broadcast_win);
345  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
346  Iterator output(out, win);
347 
348  execute_window_loop(win, [&](const Coordinates &)
349  {
350  const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
351  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
352 
353  const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
354  const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
355 
356  int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
357  voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
358  for(; x < window_end_x; ++x)
359  {
360  const float afs = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
361  const float bfs = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
362  *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
363  }
364  },
365  broadcast_input, non_broadcast_input, output);
366  }
367  else
368  {
369  const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
370  const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
371 
372  // Input1 quantization info
373  const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
374  const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
375 
376  // Input2 quantization info
377  const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
378  const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
379 
380  // Clear X Dimension on execution window as we handle manually
381  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
382  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
383 
384  Iterator input1(in1, input1_win);
385  Iterator input2(in2, input2_win);
386  Iterator output(out, win);
387 
388  execute_window_loop(win, [&](const Coordinates &)
389  {
390  const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
391  const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
392  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
393 
394  int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
395  vscale1, vscale2, voffseto, invvscaleo);
396  for(; x < window_end_x; ++x)
397  {
398  const float afs = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
399  const float bfs = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
400  *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
401  }
402  },
403  input1, input2, output);
404  }
405 }

◆ elementwise_op_quantized_signed()

void arm_compute::cpu::elementwise_op_quantized_signed ( const ITensor *  in1,
const ITensor *  in2,
ITensor *  out,
const Window &  window,
int8_t(*)(const float &, const float &, UniformQuantizationInfo)  scalar_func,
int(*)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool)  broadcast_func,
int(*)(int, int, int, const int8_t *, const int8_t *, int8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t)  neon_func 
)

Definition at line 514 of file elementwise_quantized_list.h.

References arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), arm_compute::dequantize_qasymm8_signed(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), arm_compute::vdequantize(), Dimensions< T >::x(), and Window::x().

Referenced by elementwise_arithm_op_quantized_signed().

521 {
522  // Create input windows
523  Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
524  Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());
525 
526  // Clear X Dimension on execution window as we handle manually
527  Window win = window;
528  win.set(Window::DimX, Window::Dimension(0, 1, 1));
529 
530  const int window_step_x = 16;
531  const auto window_start_x = static_cast<int>(window.x().start());
532  const auto window_end_x = static_cast<int>(window.x().end());
533  const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
534 
535  const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform();
536 
537  const float32x4_t voffseto = vdupq_n_f32(output_qinfo.offset);
538  const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
539 
540  if(is_broadcast_across_x)
541  {
542  // Select the broadcast input on the X axis
543  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
544  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
545  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
546  const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
547  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
548 
549  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
550  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
551 
552  const int32x4_t voffset_non_broadcast = vdupq_n_s32(non_broadcast_qinfo.offset);
553  const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);
554 
555  // Clear X Dimension on execution window as we handle manually
556  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
557 
558  Iterator broadcast_input(broadcast_tensor, broadcast_win);
559  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
560  Iterator output(out, win);
561 
562  execute_window_loop(win, [&](const Coordinates &)
563  {
564  const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
565  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
566 
567  const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
568  const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
569 
570  int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
571  voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
572  for(; x < window_end_x; ++x)
573  {
574  const float afs = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
575  const float bfs = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
576  *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
577  }
578  },
579  broadcast_input, non_broadcast_input, output);
580  }
581  else
582  {
583  const UniformQuantizationInfo input1_qinfo = in1->info()->quantization_info().uniform();
584  const UniformQuantizationInfo input2_qinfo = in2->info()->quantization_info().uniform();
585 
586  // Input1 quantization info
587  const int32x4_t voffset1 = vdupq_n_s32(input1_qinfo.offset);
588  const float32x4_t vscale1 = vdupq_n_f32(input1_qinfo.scale);
589 
590  // Input2 quantization info
591  const int32x4_t voffset2 = vdupq_n_s32(input2_qinfo.offset);
592  const float32x4_t vscale2 = vdupq_n_f32(input2_qinfo.scale);
593 
594  // Clear X Dimension on execution window as we handle manually
595  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
596  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
597 
598  Iterator input1(in1, input1_win);
599  Iterator input2(in2, input2_win);
600  Iterator output(out, win);
601 
602  execute_window_loop(win, [&](const Coordinates &)
603  {
604  const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
605  const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
606  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
607 
608  int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
609  vscale1, vscale2, voffseto, invvscaleo);
610  for(; x < window_end_x; ++x)
611  {
612  const float afs = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
613  const float bfs = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
614  *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
615  }
616  },
617  input1, input2, output);
618  }
619 }

◆ elementwise_op_scalar_imp()

ScalarType arm_compute::cpu::elementwise_op_scalar_imp ( ElementWiseUnary  op,
const ScalarType &  a 
)
inline

Definition at line 35 of file elementwise_unary_list.h.

References arm_compute::ABS, ARM_COMPUTE_ERROR, arm_compute::EXP, arm_compute::LOG, arm_compute::support::cpp11::nearbyint(), arm_compute::NEG, arm_compute::ROUND, arm_compute::RSQRT, and arm_compute::SIN.

Referenced by elementwise_op().

36 {
37  switch(op)
38  {
39  case ElementWiseUnary::RSQRT:
40  return 1 / sqrt(a);
41  case ElementWiseUnary::EXP:
42  return std::exp(a);
43  case ElementWiseUnary::NEG:
44  return -a;
45  case ElementWiseUnary::LOG:
46  return std::log(a);
47  case ElementWiseUnary::ABS:
48  return std::abs(a);
49  case ElementWiseUnary::ROUND:
50  return support::cpp11::nearbyint(a);
51  case ElementWiseUnary::SIN:
52  return std::sin(a);
53  default:
54  ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
55  }
56 }

◆ fp16_neon_activation()

void arm_compute::cpu::fp16_neon_activation ( const ITensor *  src,
ITensor *  dst,
const ActivationLayerInfo &  act_info,
const Window &  window 
)

◆ fp16_neon_batch_normalization()

void arm_compute::cpu::fp16_neon_batch_normalization ( ITensor *  src,
ITensor *  dst,
const ITensor *  mean,
const ITensor *  var,
const ITensor *  beta,
const ITensor *  gamma,
float  epsilon,
ActivationLayerInfo act_info,
const Window &  window 
)

◆ fp16_neon_floor()

void arm_compute::cpu::fp16_neon_floor ( const void *  src,
void *  dst,
int  len 
)

◆ fp16_sve_activation()

void arm_compute::cpu::fp16_sve_activation ( const ITensor *  src,
ITensor *  dst,
const ActivationLayerInfo &  act_info,
const Window &  window 
)

◆ fp16_sve_batch_normalization()

void arm_compute::cpu::fp16_sve_batch_normalization ( ITensor *  src,
ITensor *  dst,
const ITensor *  mean,
const ITensor *  var,
const ITensor *  beta,
const ITensor *  gamma,
float  epsilon,
ActivationLayerInfo act_info,
const Window &  window 
)

◆ fp16_sve_scale()

void arm_compute::cpu::fp16_sve_scale ( const ITensor *  src,
ITensor *  dst,
const ITensor *  offsets,
const ITensor *  dx,
const ITensor *  dy,
InterpolationPolicy  policy,
BorderMode  border_mode,
PixelValue  constant_border_value,
float  sampling_offset,
bool  align_corners,
const Window &  window 
)

◆ fp32_neon_activation()

void fp32_neon_activation ( const ITensor *  src,
ITensor *  dst,
const ActivationLayerInfo &  act_info,
const Window &  window 
)

SIMD vector tag type.

Definition at line 49 of file fp32.cpp.

References ActivationLayerInfo::a(), ActivationLayerInfo::ABS, ActivationLayerInfo::activation(), ARM_COMPUTE_ERROR, arm_compute::test::validation::b, ActivationLayerInfo::b(), ActivationLayerInfo::BOUNDED_RELU, Window::collapse_if_possible(), Window::DimX, Window::DimZ, ActivationLayerInfo::ELU, Window::Dimension::end(), arm_compute::execute_window_loop(), ActivationLayerInfo::HARD_SWISH, ActivationLayerInfo::IDENTITY, arm_compute::test::validation::input, ActivationLayerInfo::LEAKY_RELU, ActivationLayerInfo::LINEAR, ActivationLayerInfo::LOGISTIC, ActivationLayerInfo::LU_BOUNDED_RELU, Iterator::ptr(), ActivationLayerInfo::RELU, Window::set(), ActivationLayerInfo::SOFT_RELU, ActivationLayerInfo::SQRT, ActivationLayerInfo::SQUARE, Window::Dimension::start(), ActivationLayerInfo::TANH, arm_compute::wrapper::vabs(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vbsl(), arm_compute::wrapper::vceq(), arm_compute::wrapper::vcge(), arm_compute::wrapper::vcgt(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vexpq(), arm_compute::wrapper::vinv(), arm_compute::wrapper::vinvsqrt(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vlog(), arm_compute::wrapper::vmax(), arm_compute::wrapper::vmin(), arm_compute::wrapper::vmla(), arm_compute::wrapper::vmul(), arm_compute::wrapper::vneg(), arm_compute::wrapper::vnot(), arm_compute::wrapper::vstore(), arm_compute::wrapper::vsub(), arm_compute::wrapper::vtanh(), and Window::x().

50 {
51  /** SIMD vector tag type. */
52  using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
53 
54  constexpr int window_step_x = 4;
55  const auto window_start_x = static_cast<int>(window.x().start());
56  const auto window_end_x = static_cast<int>(window.x().end());
57  const ActivationLayerInfo::ActivationFunction act = act_info.activation();
58 
59  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
60  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
61 
62  Iterator input(src, win_collapsed);
63  Iterator output(dst, win_collapsed);
64 
65  // In case of non-aarch64, a small delta value is added to the input
66  // to prevent NAN values caused by zeros in inputs to SQRT.
67  // In case of aarh64, we call vsqrt directly, so we don't use delta.
68 #ifndef __aarch64__
69  const auto delta = wrapper::vdup_n(static_cast<float>(1e-24), ExactTagType {});
70 #endif /* __aarch64__ */
71  const auto const_1 = wrapper::vdup_n(static_cast<float>(1.f), ExactTagType {});
72  const auto const_0 = wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
73  const auto const_6 = wrapper::vdup_n(static_cast<float>(6.f), ExactTagType{});
74  const auto const_3 = wrapper::vdup_n(static_cast<float>(3.f), ExactTagType{});
75  const auto const_inv_6 = wrapper::vdup_n(static_cast<float>(0.166666667f), ExactTagType{});
76 
77  constexpr float soft_relu_thresh = 12.f;
78  const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast<float>(soft_relu_thresh), ExactTagType{});
79 
80  const auto va = wrapper::vdup_n(static_cast<float>(act_info.a()), ExactTagType{});
81  const auto vb = wrapper::vdup_n(static_cast<float>(act_info.b()), ExactTagType{});
82  const auto a = static_cast<float>(act_info.a());
83  const auto b = static_cast<float>(act_info.b());
84  execute_window_loop(win_collapsed, [&](const Coordinates &)
85  {
86  const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
87  const auto output_ptr = reinterpret_cast<float *>(output.ptr());
88 
89  wrapper::traits::neon_bitvector_t<float, wrapper::traits::BitWidth::W128> tmp;
90 
91  // Compute S elements per iteration
92  int x = window_start_x;
93  for(; x <= (window_end_x - window_step_x); x += window_step_x)
94  {
95  const auto vin = wrapper::vloadq(input_ptr + x);
96  switch(act)
97  {
98  case ActivationLayerInfo::ActivationFunction::ABS:
99  tmp = wrapper::vabs(vin);
100  break;
101  case ActivationLayerInfo::ActivationFunction::LINEAR:
102  tmp = wrapper::vmla(vb, va, vin);
103  break;
104  case ActivationLayerInfo::ActivationFunction::LOGISTIC:
105  tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
106  break;
107  case ActivationLayerInfo::ActivationFunction::RELU:
108  tmp = wrapper::vmax(const_0, vin);
109  break;
110  case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
111  tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
112  break;
113  case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
114  tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
115  break;
116  case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
117  tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
118  break;
119  case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
120  tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
121  break;
122  case ActivationLayerInfo::ActivationFunction::ELU:
123  tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
124  break;
125  case ActivationLayerInfo::ActivationFunction::SQRT:
126 #ifdef __aarch64__
127  tmp = wrapper::vsqrt(vin);
128 #else /* __aarch64__ */
129  {
130  const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{}));
131  tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
132  tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
133  }
134 #endif /* __aarch64__ */
135  break;
136  case ActivationLayerInfo::ActivationFunction::SQUARE:
137  tmp = wrapper::vmul(vin, vin);
138  break;
139  case ActivationLayerInfo::ActivationFunction::TANH:
140  tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
141  break;
142  case ActivationLayerInfo::ActivationFunction::IDENTITY:
143  tmp = vin;
144  break;
145  case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
146  tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
147  break;
148  default:
149  ARM_COMPUTE_ERROR("Unsupported activation function");
150  }
151  wrapper::vstore(output_ptr + x, tmp);
152  }
153 
154  // Compute left-over elements
155  for(; x < window_end_x; ++x)
156  {
157  const float in = *(reinterpret_cast<const float *>(input_ptr + x));
158  float tmp;
159  switch(act)
160  {
161  case ActivationLayerInfo::ActivationFunction::ABS:
162  tmp = std::abs(in);
163  break;
164  case ActivationLayerInfo::ActivationFunction::LINEAR:
165  tmp = a * in + b;
166  break;
167  case ActivationLayerInfo::ActivationFunction::LOGISTIC:
168  tmp = static_cast<float>(1) / (static_cast<float>(1) + std::exp(-in));
169  break;
170  case ActivationLayerInfo::ActivationFunction::RELU:
171  tmp = std::max<float>(static_cast<float>(0), in);
172  break;
173  case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
174  tmp = std::min<float>(a, std::max(static_cast<float>(0), in));
175  break;
176  case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
177  tmp = std::min<float>(a, std::max<float>(b, in));
178  break;
179  case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
180  tmp = (in > 0) ? in : a * in;
181  break;
182  case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
183  tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<float>(1) + std::exp(in));
184  break;
185  case ActivationLayerInfo::ActivationFunction::ELU:
186  tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
187  break;
188  case ActivationLayerInfo::ActivationFunction::SQRT:
189  tmp = std::sqrt(in);
190  break;
191  case ActivationLayerInfo::ActivationFunction::SQUARE:
192  tmp = in * in;
193  break;
194  case ActivationLayerInfo::ActivationFunction::TANH:
195  tmp = a * std::tanh(b * in);
196  break;
197  case ActivationLayerInfo::ActivationFunction::IDENTITY:
198  tmp = in;
199  break;
200  case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
201  tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
202  break;
203  default:
204  ARM_COMPUTE_ERROR("Unsupported activation function");
205  }
206  *(output_ptr + x) = tmp;
207  }
208  },
209  input, output);
210 }
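Note: as a cross-check of the HARD_SWISH branch above, the scalar formula being vectorised is x * clamp(x + 3, 0, 6) / 6. A minimal restatement (illustrative helper, not a library symbol):

#include <algorithm>

// hard_swish(x) = x * clamp(x + 3, 0, 6) / 6
inline float hard_swish(float x)
{
    return x * (std::min(std::max(x + 3.0f, 0.0f), 6.0f) * 0.166666667f);
}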

◆ fp32_neon_batch_normalization()

void fp32_neon_batch_normalization ( ITensor *  src,
ITensor *  dst,
const ITensor *  mean,
const ITensor *  var,
const ITensor *  beta,
const ITensor *  gamma,
float  epsilon,
ActivationLayerInfo act_info,
const Window &  window 
)

Definition at line 135 of file fp32.cpp.

References ActivationLayerInfo::activation(), arm_compute::test::validation::dst, ActivationLayerInfo::enabled(), arm_compute::quantization::epsilon, and arm_compute::test::validation::src.

137 {
138  if(act_info.enabled())
139  {
140  fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
141  }
142  else
143  {
144  batch_normalization<detail::dummy<float, 4>>(src, dst, mean, var, beta, gamma, epsilon, act_info, window);
145  }
146 }
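Note: both dispatch targets apply the standard per-element batch-normalization transform, optionally fused with an activation. A hedged scalar restatement of that transform (batch_norm is an illustrative helper, not the library's kernel):

#include <cmath>

// y = gamma * (x - mean) / sqrt(var + epsilon) + beta
inline float batch_norm(float x, float mean, float var, float beta, float gamma, float epsilon)
{
    return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
}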

◆ fp32_neon_floor()

void fp32_neon_floor ( const void *  src,
void *  dst,
int  len 
)

Definition at line 37 of file fp32.cpp.

References ARM_COMPUTE_ASSERT, ARM_COMPUTE_ASSERT_NOT_NULLPTR, arm_compute::test::validation::dst, arm_compute::test::validation::src, step, and arm_compute::vfloorq_f32().

38 {
39  ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
40  ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
41  ARM_COMPUTE_ASSERT(len >= 0);
42 
43  auto psrc = static_cast<const float *>(src);
44  auto pdst = static_cast<float *>(dst);
45 
46  for(; len >= step; len -= step)
47  {
48  vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc)));
49  psrc += step;
50  pdst += step;
51  }
52 
53  for(; len > 0; --len)
54  {
55  *pdst = std::floor(*psrc);
56  ++pdst;
57  ++psrc;
58  }
59 }
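Note: a usage sketch based on the signature above. fp32_neon_floor lives in an internal header of the library's source tree, so the forward declaration below merely mirrors the documented signature and reaching the symbol from user code is an assumption:

#include <vector>

namespace arm_compute { namespace cpu { void fp32_neon_floor(const void *src, void *dst, int len); } }

void floor_example()
{
    std::vector<float> src = { 1.7f, -0.3f, 2.0f, -2.5f };
    std::vector<float> dst(src.size());
    // The vectorised loop consumes multiples of the SIMD step; the scalar loop handles the rest.
    arm_compute::cpu::fp32_neon_floor(src.data(), dst.data(), static_cast<int>(src.size()));
    // dst now holds { 1.0f, -1.0f, 2.0f, -3.0f }
}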

◆ fp32_sve_activation()

void arm_compute::cpu::fp32_sve_activation ( const ITensor *  src,
ITensor *  dst,
const ActivationLayerInfo &  act_info,
const Window &  window 
)

◆ fp32_sve_batch_normalization()

void arm_compute::cpu::fp32_sve_batch_normalization ( ITensor *  src,
ITensor *  dst,
const ITensor *  mean,
const ITensor *  var,
const ITensor *  beta,
const ITensor *  gamma,
float  epsilon,
ActivationLayerInfo act_info,
const Window &  window 
)

◆ fp32_sve_scale()

void arm_compute::cpu::fp32_sve_scale ( const ITensor *  src,
ITensor *  dst,
const ITensor *  offsets,
const ITensor *  dx,
const ITensor *  dy,
InterpolationPolicy  policy,
BorderMode  border_mode,
PixelValue  constant_border_value,
float  sampling_offset,
bool  align_corners,
const Window &  window 
)

◆ load_quantized()

float32x4x4_t arm_compute::cpu::load_quantized ( const uint8_t *  input1_ptr,
const int32x4_t &  offset,
const float32x4_t &  scale 
)

Definition at line 33 of file elementwise_quantized_list.h.

Referenced by elementwise_arithm_op_quantized_broadcast_loop(), elementwise_arithm_op_quantized_loop(), elementwise_comp_op_quantized_broadcast_loop(), and elementwise_comp_op_quantized_loop().

34 {
35  qasymm8x16_t x = vld1q_u8(input1_ptr);
36  const float32x4x4_t out =
37  {
38  {
39  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
40  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
41  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
42  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
43  }
44  };
45  return out;
46 }
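Note: each of the four float32x4_t quarters applies the same affine dequantization to four widened lanes. A scalar restatement of what one lane computes (dequant_lane is an illustrative helper, not a library symbol):

#include <cstdint>

// (q - zero_point) * scale, applied lane-wise after widening u8 -> u16 -> s32.
inline float dequant_lane(uint8_t q, int32_t offset, float scale)
{
    return static_cast<float>(static_cast<int32_t>(q) - offset) * scale;
}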

◆ load_quantized_signed()

float32x4x4_t arm_compute::cpu::load_quantized_signed ( const int8_t *  input1_ptr,
const int32x4_t &  offset,
const float32x4_t &  scale 
)

Definition at line 48 of file elementwise_quantized_list.h.

Referenced by elementwise_arithm_op_quantized_signed_broadcast_loop(), elementwise_arithm_op_quantized_singed_loop(), elementwise_comp_op_quantized_signed_broadcast_loop(), and elementwise_comp_op_quantized_signed_loop().

49 {
50  qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
51  const float32x4x4_t out =
52  {
53  {
54  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
55  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
56  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
57  vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
58  }
59  };
60  return out;
61 }

◆ nearest_neon_scale()

void arm_compute::cpu::nearest_neon_scale ( const ITensor *  src,
ITensor *  dst,
const ITensor *  offsets,
float  sampling_offset,
bool  align_corners,
const Window &  window 
)

Definition at line 51 of file list.h.

References BorderSize::bottom, ITensor::buffer(), arm_compute::scale_utils::calculate_resize_ratio(), ITensorInfo::dimension(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), BorderSize::left, offset(), ITensorInfo::offset_first_element_in_bytes(), ITensorInfo::padding(), Iterator::ptr(), ITensor::ptr_to_element(), BorderSize::right, arm_compute::utils::rounding::round_half_away_from_zero(), Window::set(), Window::Dimension::start(), ITensorInfo::strides_in_bytes(), BorderSize::top, arm_compute::wrapper::vloadq(), arm_compute::wrapper::vstore(), and Window::x().

53 {
54  const size_t in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
55  const size_t in_stride_w = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
56  const size_t in_stride_wc = in_stride_w * in_stride_c;
57  const size_t in_dim_h = src->info()->dimension(2);
58 
59  // Compute the ratio between source height and destination height
60  const auto hr = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
61  const auto window_start_x = static_cast<int32_t>(window.x().start());
62  const auto window_end_x = static_cast<int32_t>(window.x().end());
63  const int window_step_x = 16 / sizeof(T);
64 
65  Window win(window);
66  win.set(Window::DimX, Window::Dimension(0, 1, 1));
67  Iterator out(dst, win);
68 
69  const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
70  const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
71 
72  execute_window_loop(win, [&](const Coordinates & id)
73  {
74  const int32_t offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
75  const auto in_hi = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
76  const int offset_row = in_hi * in_stride_wc;
77  int32_t x = window_start_x;
78  const T *in_ptr = reinterpret_cast<const T *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
79 
80  for(; x <= window_end_x - window_step_x; x += window_step_x)
81  {
82  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x,
83  wrapper::vloadq(in_ptr + offset + offset_row + x));
84  }
85  for(; x < window_end_x; ++x)
86  {
87  *(reinterpret_cast<T *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
88  }
89  },
90  out);
91 }
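Note: the source row is selected by scaling the destination coordinate with the resize ratio hr and then either rounding half away from zero (align_corners) or flooring. A hedged sketch of that index selection (nearest_src_index is an illustrative helper; the library uses its own rounding utility):

#include <cmath>

inline int nearest_src_index(int dst_index, float hr, float sampling_offset, bool align_corners)
{
    const float pos = (static_cast<float>(dst_index) + sampling_offset) * hr;
    // std::lround rounds halfway cases away from zero, approximating round_half_away_from_zero.
    return align_corners ? static_cast<int>(std::lround(pos)) : static_cast<int>(std::floor(pos));
}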

◆ neon_logits_1d_max()

void arm_compute::cpu::neon_logits_1d_max ( const ITensor *  in,
ITensor *  out,
const Window &  window 
)

SIMD vector tag type.

Definition at line 37 of file list.h.

References Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), arm_compute::test::validation::input, Iterator::ptr(), Window::set(), Window::Dimension::start(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vgethigh(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vgetlow(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmax(), arm_compute::wrapper::vpmax(), and Window::x().

38 {
39  /** SIMD vector tag type. */
40  using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
41 
42  constexpr int window_step_x = 16 / sizeof(T);
43  const auto window_start_x = static_cast<int>(window.x().start());
44  const auto window_end_x = static_cast<int>(window.x().end());
45 
46  Window win{ window };
47  win.set(Window::DimX, Window::Dimension(0, 1, 1));
48  Iterator input(in, win);
49  Iterator output(out, win);
50 
51  const int sum_stages = log2(window_step_x / 2);
52  execute_window_loop(win, [&](const Coordinates &)
53  {
54  // Get pointers
55  const auto in_ptr = reinterpret_cast<const T *>(input.ptr());
56  const auto out_ptr = reinterpret_cast<T *>(output.ptr());
57 
58  // Init max value
59  auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
60  int x = window_start_x;
61 
62  for(; x <= (window_end_x - window_step_x); x += window_step_x)
63  {
64  const auto current_value = wrapper::vloadq(in_ptr + x);
65  vec_max = wrapper::vmax(vec_max, current_value);
66  }
67  auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
68 
69  for(int i = 0; i < sum_stages; ++i)
70  {
71  carry_max = wrapper::vpmax(carry_max, carry_max);
72  }
73  T max_val = wrapper::vgetlane(carry_max, 0);
74 
75  // Compute left-over elements
76  for(; x < window_end_x; ++x)
77  {
78  max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
79  }
80 
81  *out_ptr = max_val;
82  },
83  input, output);
84 }
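Note: the kernel folds the 128-bit vector into its two halves and then reduces with vpmax over log2(window_step_x / 2) pairwise stages; the scalar loop covers elements that do not fill a full vector. A plain scalar equivalent of the whole row reduction (illustrative only):

#include <algorithm>
#include <limits>

inline float row_max(const float *row, int len)
{
    float m = std::numeric_limits<float>::lowest();
    for(int i = 0; i < len; ++i)
    {
        m = std::max(m, row[i]);
    }
    return m;
}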

◆ neon_softmax_logits_1d_float()

void arm_compute::cpu::neon_softmax_logits_1d_float ( const ITensor *  in,
const ITensor *  max,
void *const  tmp,
ITensor *  out,
const float  beta,
bool  is_log,
const Window &  window 
)

SIMD vector tag type.

Definition at line 260 of file list.h.

References ValidRegion::anchor, arm_compute::execute_window_loop(), ITensor::info(), input_width, Iterator::ptr(), ValidRegion::shape, arm_compute::wrapper::vadd(), ITensorInfo::valid_region(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vexpq(), arm_compute::wrapper::vgethigh(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vgetlow(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmul(), arm_compute::wrapper::vpadd(), arm_compute::wrapper::vstore(), arm_compute::wrapper::vsub(), and Dimensions< T >::x().

262 {
263  const int start_x = in->info()->valid_region().anchor.x();
264  const int input_width = in->info()->valid_region().shape.x();
265 
266  Iterator in_it(in, window);
267  Iterator max_it(max, window);
268  Iterator out_it(out, window);
269 
270  /** SIMD vector tag type. */
271  using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
272 
273  constexpr int vec_size = 16 / sizeof(T);
274  const int sum_stages = log2(vec_size / 2);
275 
276  execute_window_loop(window, [&](const Coordinates &)
277  {
278  /* Get pointers */
279  const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
280  const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
281  const auto tmp_ptr = reinterpret_cast<T *>(tmp);
282 
283  T sum{};
284  T sum_inversed{};
285 
286  /* Compute exponentials and sum */
287  {
288  /* Get max value */
289  const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
290  const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
291 
292  /* Init sum to zero */
293  auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
294 
295  /* Loop over row and compute exponentials and sum */
296  int x = 0;
297  for(; x <= (input_width - vec_size); x += vec_size)
298  {
299  auto vec_elements = wrapper::vloadq(in_ptr + x);
300  vec_elements = wrapper::vsub(vec_elements, vec_max);
301  if(is_log)
302  {
303  vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
304  vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
305  }
306  else
307  {
308  vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
309  vec_sum = wrapper::vadd(vec_sum, vec_elements);
310  }
311  wrapper::vstore(tmp_ptr + x, vec_elements);
312  }
313 
314  /* Reduce sum */
315  auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
316  for(int i = 0; i < sum_stages; ++i)
317  {
318  sum_res = wrapper::vpadd(sum_res, sum_res);
319  }
320  sum = wrapper::vgetlane(sum_res, 0);
321 
322  /* Run remaining elements */
323  for(; x < input_width; ++x)
324  {
325  T element{};
326 
327  if(is_log)
328  {
329  element = (in_ptr[x] - max_val) * beta;
330  sum += std::exp(element);
331  }
332  else
333  {
334  element = std::exp((in_ptr[x] - max_val) * beta);
335  sum += element;
336  }
337  tmp_ptr[x] = element;
338  }
339 
340  if(!is_log)
341  {
342  sum_inversed = T(1) / sum;
343  }
344  else
345  {
346  sum = static_cast<T>(std::log(sum));
347  }
348  }
349 
350  /* Normalize exponentials */
351  {
352  /* Loop over row and compute softmax */
353  int x = 0;
354  for(; x <= (input_width - vec_size); x += vec_size)
355  {
356  auto vec_in = wrapper::vloadq(tmp_ptr + x);
357  auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
358  if(is_log)
359  {
360  normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
361  }
362  else
363  {
364  normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
365  }
366  wrapper::vstore(out_ptr + x, normalized_value);
367  }
368  /* Run remaining elements */
369  for(; x < input_width; ++x)
370  {
371  if(is_log)
372  {
373  out_ptr[x] = tmp_ptr[x] - sum;
374  }
375  else
376  {
377  out_ptr[x] = tmp_ptr[x] * sum_inversed;
378  }
379  }
380  }
381  },
382  in_it, max_it, out_it);
383 }

◆ neon_softmax_logits_1d_quantized()

void arm_compute::cpu::neon_softmax_logits_1d_quantized ( const ITensor *  in,
const ITensor *  max,
void *const  tmp,
ITensor *  out,
float  beta,
bool  is_log,
const Window &  window 
)

Definition at line 87 of file list.h.

References ValidRegion::anchor, arm_compute::execute_window_loop(), ITensor::info(), input_width, Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::utils::cast::saturate_cast(), UniformQuantizationInfo::scale, ValidRegion::shape, QuantizationInfo::uniform(), ITensorInfo::valid_region(), arm_compute::wrapper::vdup_n(), arm_compute::vexpq_f32(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vqsub(), arm_compute::wrapper::vstore(), arm_compute::wrapper::vsub(), and Dimensions< T >::x().

89 {
90  static_assert(std::is_same<T, qasymm8_t>::value
91  || std::is_same<T, qasymm8_signed_t>::value,
92  "quantized type should be either qasymm8_t or qasymm8_signed_t.");
93 
94  const int start_x = in->info()->valid_region().anchor.x();
95  const int input_width = in->info()->valid_region().shape.x();
96 
97  const float scale_beta = -beta * in->info()->quantization_info().uniform().scale;
98  const auto scale_beta_vec = vdupq_n_f32(scale_beta);
99 
100  Iterator in_it(in, window);
101  Iterator max_it(max, window);
102  Iterator out_it(out, window);
103  constexpr int vec_size = 16;
104 
105  execute_window_loop(window, [&](const Coordinates &)
106  {
107  /* Get pointers */
108  const auto in_ptr = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
109  const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
110  const auto tmp_ptr = reinterpret_cast<float *>(tmp);
111 
112  float sum{};
113  float sum_inversed{};
114 
115  /* Compute exponentials and sum */
116  {
117  /* Get max value */
118  const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
119  const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
120 
121  /* Init sum to zero */
122  float32x4x4_t vec_sum =
123  {
124  vdupq_n_f32(0.f),
125  vdupq_n_f32(0.f),
126  vdupq_n_f32(0.f),
127  vdupq_n_f32(0.f),
128  };
129 
130  /* Loop over row and compute exponentials and sum */
131  int x = 0;
132  for(; x <= (input_width - vec_size); x += vec_size)
133  {
134  auto vec_elements = wrapper::vloadq(in_ptr + x);
135  vec_elements = wrapper::vqsub(vec_max, vec_elements);
136  auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
137 
138  if(is_log)
139  {
140  vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
141  vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
142  vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
143  vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
144  vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
145  vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
146  vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
147  vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
148  }
149  else
150  {
151  vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
152  vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
153  vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
154  vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
155  vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
156  vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
157  vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
158  vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
159  }
160 
161  vst4q_f32(tmp_ptr + x, vec_elements_flt);
162  }
163 
164  /* Reduce sum */
165  const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
166  auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
167  sum_res = vpadd_f32(sum_res, sum_res);
168  sum = wrapper::vgetlane(sum_res, 0);
169 
170  /* Run remaining elements */
171  for(; x < input_width; ++x)
172  {
173  float element{};
174  if(is_log)
175  {
176  element = (max_val - in_ptr[x]) * scale_beta;
177  sum += std::exp(element);
178  }
179  else
180  {
181  element = std::exp((max_val - in_ptr[x]) * scale_beta);
182  sum += element;
183  }
184 
185  tmp_ptr[x] = element;
186  }
187 
188  if(!is_log)
189  {
190  sum_inversed = 256.f / sum;
191  }
192  else
193  {
194  sum = std::log(sum);
195  }
196  }
197 
198  /* Normalize exponentials */
199  {
200  constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
201  /* Loop over row and compute softmax */
202  int x = 0;
203  for(; x <= (input_width - vec_size); x += vec_size)
204  {
205  using int_vec_type = wrapper::traits::neon_vector_t<T, 16>;
206  float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
207  int_vec_type normalized_value{};
208  if(is_log)
209  {
210  const float32x4x4_t sub =
211  {
212  vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
213  vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
214  vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
215  vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
216  };
217  normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
218  }
219  else
220  {
221  float32x4x4_t mul =
222  {
223  vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
224  vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
225  vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
226  vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
227  };
228 
229  if(is_qasymm8_signed)
230  {
231  const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
232  mul.val[0] = wrapper::vsub(mul.val[0], offset_vec);
233  mul.val[1] = wrapper::vsub(mul.val[1], offset_vec);
234  mul.val[2] = wrapper::vsub(mul.val[2], offset_vec);
235  mul.val[3] = wrapper::vsub(mul.val[3], offset_vec);
236  }
237 
238  normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
239  }
240  wrapper::vstore(out_ptr + x, normalized_value);
241  }
242  /* Run remaining elements */
243  for(; x < input_width; ++x)
244  {
245  if(is_log)
246  {
247  out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
248  }
249  else
250  {
251  out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
252  }
253  }
254  }
255  },
256  in_it, max_it, out_it);
257 }
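
As a minimal scalar sketch of the non-log qasymm8 path above (illustrative only, not the library's API): the kernel works on (max - in), which is non-negative, scales it by -beta * input_scale, and writes the result with an implicit output scale of 1/256:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Scalar model of one qasymm8 row (is_log == false). `in_scale` is the uniform
// quantization scale of the input tensor; the output offset is 0 for qasymm8
// (the kernel additionally subtracts 128 for qasymm8_signed).
std::vector<uint8_t> softmax_row_qasymm8(const std::vector<uint8_t> &in, float beta, float in_scale)
{
    const uint8_t max_val    = *std::max_element(in.begin(), in.end());
    const float   scale_beta = -beta * in_scale;
    std::vector<float> tmp(in.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        tmp[i] = std::exp((max_val - in[i]) * scale_beta); // (max - in) >= 0, scale_beta <= 0
        sum   += tmp[i];
    }
    std::vector<uint8_t> out(in.size());
    const float sum_inversed = 256.f / sum; // implicit output scale of 1/256
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        const float q = tmp[i] * sum_inversed;
        out[i] = static_cast<uint8_t>(std::min(255.f, std::max(0.f, q))); // saturate
    }
    return out;
}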

◆ offset_no_padding()

uint32_t arm_compute::cpu::offset_no_padding ( uint32_t  padded_offset,
const Coordinates &  id,
const ITensorInfo &  info,
int  pool_stride_x,
int  pool_stride_y,
DataLayout  data_layout 
)
inline

Definition at line 62 of file list.h.

References BorderSize::bottom, BorderSize::left, arm_compute::NCHW, ITensorInfo::padding(), BorderSize::right, ITensorInfo::strides_in_bytes(), ITensorInfo::tensor_shape(), BorderSize::top, Dimensions< T >::y(), and Dimensions< T >::z().

63 {
64  const int pad_left = info.padding().left;
65  const int pad_right = info.padding().right;
66  const int pad_top = info.padding().top;
67  const int pad_bottom = info.padding().bottom;
68  const int in_stride_y = static_cast<int>(info.strides_in_bytes().y());
69  const int in_stride_w = static_cast<int>(info.strides_in_bytes()[3]);
70  const int pad_horiz = pad_left + pad_right;
71  const int pad_vert = pad_top + pad_bottom;
72 
73  if(data_layout == DataLayout::NCHW)
74  {
75  const uint32_t offset_base = padded_offset
76  - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
77  - pad_top * sizeof(T) /* top padding */
78  - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
79  - in_stride_w * id[3];
80 
81  return offset_base;
82  }
83  else
84  {
85  const uint32_t offset_base = padded_offset
86  - sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
87  - pad_top * sizeof(T) // top padding
88  - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
89  - in_stride_w * id[3];
90 
91  return offset_base;
92  }
93 }
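
To make the intent of the helper concrete, here is a minimal sketch (illustrative names, not the library's API) of the corresponding offset in a tensor with no padding at all; offset_no_padding() recovers this value from the padded offset by subtracting the bytes contributed by the left/right/top/bottom padding of each row and plane:

#include <cstdint>

// Byte offset of element (x, y, z, w) in an unpadded NCHW tensor of
// shape (width, height, channels, batches) with element type T.
template <typename T>
uint32_t unpadded_offset_nchw(int x, int y, int z, int w, int width, int height, int channels)
{
    return static_cast<uint32_t>(sizeof(T) * (x + width * (y + height * (z + channels * w))));
}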

◆ poolingMxN_fp16_neon_nhwc()

void arm_compute::cpu::poolingMxN_fp16_neon_nhwc ( const ITensor *  src0,
ITensor *  dst0,
ITensor *  dst1,
PoolingLayerInfo &  ,
const Window &  window_src,
const Window &  window 
)

◆ poolingMxN_fp32_neon_nhwc()

void poolingMxN_fp32_neon_nhwc ( const ITensor *  src,
ITensor *  dst0,
ITensor *  dst1,
PoolingLayerInfo &  pool_info,
const Window &  window_src,
const Window &  window 
)

Definition at line 146 of file fp32.cpp.

References calculate_avg_scale(), ITensorInfo::dimension(), Window::DimX, Window::Dimension::end(), PoolingLayerInfo::exclude_padding, arm_compute::execute_window_loop(), Size2D::height, arm_compute::test::validation::idx_height, arm_compute::test::validation::idx_width, ITensor::info(), PoolingLayerInfo::is_global_pooling, arm_compute::L2, arm_compute::support::cpp11::lowest(), arm_compute::MAX, arm_compute::NHWC, PadStrideInfo::pad_bottom(), PadStrideInfo::pad_left(), PadStrideInfo::pad_right(), PoolingLayerInfo::pad_stride_info, PadStrideInfo::pad_top(), PoolingLayerInfo::pool_size, pool_stride_x, PoolingLayerInfo::pool_type, Iterator::ptr(), arm_compute::test::validation::scale, Window::set(), Window::Dimension::start(), PadStrideInfo::stride(), ITensorInfo::strides_in_bytes(), ITensorInfo::tensor_shape(), Size2D::width, Window::x(), Dimensions< T >::y(), Window::y(), Dimensions< T >::z(), and Window::z().

147 {
148  if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
149  {
150  pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
151  }
152  else
153  {
154  const int window_start_x = window.x().start();
155  const int window_end_x = window.x().end();
156  const int window_step_x = 4;
157 
158  Window window_out = window;
159  window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
160 
161  Iterator in(src, window_src);
162  Iterator out(dst0, window_out);
163 
164  const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
165  const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
166  const int pool_pad_right = pool_info.pad_stride_info.pad_right();
167  const int pool_pad_top = pool_info.pad_stride_info.pad_top();
168  const int pool_pad_left = pool_info.pad_stride_info.pad_left();
169  const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
170  int pool_stride_x = 0;
171  int pool_stride_y = 0;
172  std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
173  const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
174  const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
175 
176  float32x4_t vres;
177 
178  execute_window_loop(window_out, [&](const Coordinates & id)
179  {
180  const int idx_width = id.y() * pool_stride_x;
181  const int idx_height = id.z() * pool_stride_y;
182  const int pool_limit_y = pool_pad_top - idx_height;
183  const int pool_limit_x = pool_pad_left - idx_width;
184 
185  const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
186  const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
187  const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
188  const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
189 
190  int x_off = window_start_x;
191  for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
192  {
193  if(pool_info.pool_type != PoolingType::MAX)
194  {
195  // Calculate scale
196  const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
197  pool_stride_y);
198  const float32x4_t scale_v = vdupq_n_f32(scale);
199 
200  // Perform pooling
201  vres = vdupq_n_f32(0.0f);
202 
203  for(int y = pool_start_y; y < pool_end_y; ++y)
204  {
205  for(int x = pool_start_x; x < pool_end_x; ++x)
206  {
207  const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
208  (src->info()->strides_in_bytes().z())) + x_off);
209 
210  // Get power of 2 in case of l2 pooling and accumulate
211  if(pool_info.pool_type == PoolingType::L2)
212  {
213  vres = vmlaq_f32(vres, data, data);
214  }
215  else
216  {
217  vres = vaddq_f32(vres, data);
218  }
219  }
220  }
221  // Divide by scale
222  vres = vmulq_f32(vres, scale_v);
223  }
224  else
225  {
226  vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
227  for(int y = pool_start_y; y < pool_end_y; ++y)
228  {
229  for(int x = pool_start_x; x < pool_end_x; ++x)
230  {
231  const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
232  (src->info()->strides_in_bytes().z())) + x_off);
233  vres = vmaxq_f32(vres, data);
234  }
235  }
236  }
237 
238  // Calculate square-root in case of l2 pooling
239  if(pool_info.pool_type == PoolingType::L2)
240  {
241  float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
242  static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
243  static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
244  static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
245  };
246  vres = l2_res;
247  }
248 
249  // Store result
250  vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
251  }
252 
253  // Left-overs loop
254  for(; x_off < window_end_x; ++x_off)
255  {
256  float res = 0.0f;
257 
258  if(pool_info.pool_type != PoolingType::MAX)
259  {
260  // Calculate scale
261  const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
262  pool_stride_y);
263 
264  for(int y = pool_start_y; y < pool_end_y; ++y)
265  {
266  for(int x = pool_start_x; x < pool_end_x; ++x)
267  {
268  const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
269  (src->info()->strides_in_bytes().z())) + x_off);
270 
271  // Get power of 2 in case of l2 pooling and accumulate
272  if(pool_info.pool_type == PoolingType::L2)
273  {
274  res += data * data;
275  }
276  else
277  {
278  res += data;
279  }
280  }
281  }
282 
283  // Divide by scale
284  res *= scale;
285  }
286  else
287  {
288  res = std::numeric_limits<float>::lowest();
289  for(int y = pool_start_y; y < pool_end_y; ++y)
290  {
291  for(int x = pool_start_x; x < pool_end_x; ++x)
292  {
293  const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
294  (src->info()->strides_in_bytes().z())) + x_off);
295  res = std::max(res, data);
296  }
297  }
298  }
299 
300  // Calculate square-root in case of l2 pooling
301  if(pool_info.pool_type == PoolingType::L2)
302  {
303  res = std::sqrt(res);
304  }
305 
306  // Store result
307  *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
308  }
309  },
310  in, out);
311  }
312 }
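
For reference, a minimal scalar sketch (names are illustrative) of the per-output-element computation the loops above vectorize; the pooling window is assumed to have been clipped to the valid region already, and `scale` is the reciprocal of the pooling area, as produced by calculate_avg_scale():

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

enum class PoolType { MAX, AVG, L2 };

// Scalar reference for one output element over a pooling window `values`
// (already restricted to the valid region).
float pool_reference(const std::vector<float> &values, PoolType type, float scale)
{
    float res = (type == PoolType::MAX) ? std::numeric_limits<float>::lowest() : 0.f;
    for(const float v : values)
    {
        if(type == PoolType::MAX)     res = std::max(res, v);
        else if(type == PoolType::L2) res += v * v; // accumulate squares for L2
        else                          res += v;     // plain sum for AVG
    }
    if(type != PoolType::MAX) res *= scale;         // divide by the pooling area
    if(type == PoolType::L2)  res = std::sqrt(res); // square root for L2 pooling
    return res;
}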

◆ poolingMxN_q8_neon_nhwc()

void arm_compute::cpu::poolingMxN_q8_neon_nhwc ( const ITensor *  src,
ITensor *  dst0,
ITensor *  dst1,
PoolingLayerInfo &  pool_info,
const Window &  window_src,
const Window &  window 
)

Definition at line 182 of file quantized.h.

References ARM_COMPUTE_UNUSED, arm_compute::AVG, calculate_avg_scale(), ITensorInfo::dimension(), Window::DimX, Window::Dimension::end(), PoolingLayerInfo::exclude_padding, arm_compute::execute_window_loop(), Size2D::height, arm_compute::test::validation::idx_height, arm_compute::test::validation::idx_width, ITensor::info(), PoolingLayerInfo::is_global_pooling, arm_compute::MAX, arm_compute::NCHW, arm_compute::NHWC, UniformQuantizationInfo::offset, Iterator::offset(), PadStrideInfo::pad_bottom(), PadStrideInfo::pad_left(), PadStrideInfo::pad_right(), PoolingLayerInfo::pad_stride_info, PadStrideInfo::pad_top(), pool_size, PoolingLayerInfo::pool_size, pool_stride_x, PoolingLayerInfo::pool_type, Iterator::ptr(), ITensor::ptr_to_element(), ITensorInfo::quantization_info(), arm_compute::support::cpp11::round(), UniformQuantizationInfo::scale, arm_compute::test::validation::scale, Window::set(), arm_compute::test::validation::src, Window::Dimension::start(), step, PadStrideInfo::stride(), ITensorInfo::strides_in_bytes(), ITensorInfo::tensor_shape(), type, QuantizationInfo::uniform(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vaddl(), arm_compute::wrapper::vcombine(), vcvtq_f32_q32(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vext_1(), arm_compute::wrapper::vext_2(), arm_compute::wrapper::vgethigh(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vgetlow(), arm_compute::wrapper::vload(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmax(), arm_compute::wrapper::vmla(), arm_compute::wrapper::vmovl(), arm_compute::wrapper::vmovn(), arm_compute::wrapper::vpadd(), arm_compute::wrapper::vpmax(), arm_compute::wrapper::vsetlane(), arm_compute::wrapper::vstore(), arm_compute::wrapper::vtbl(), Size2D::width, Dimensions< T >::x(), Window::x(), Dimensions< T >::y(), Window::y(), Dimensions< T >::z(), and Window::z().

183 {
184  ARM_COMPUTE_UNUSED(dst1);
185 
186  const int window_start_x = window.x().start();
187  const int window_end_x = window.x().end();
188  const int window_step_x = 16;
189  const int window_half_step_x = window_step_x / 2;
190 
191  Window window_out = window;
192  window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
193 
194  Iterator in(src, window_src);
195  Iterator out(dst0, window_out);
196 
197  using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
198  using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
199  using q16_t = typename wrapper::traits::promote_t<T>;
200  using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
201  using q32_t = typename wrapper::traits::promote_t<q16_t>;
202  using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
203 
204  const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
205  const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
206  const int pool_pad_right = pool_info.pad_stride_info.pad_right();
207  const int pool_pad_top = pool_info.pad_stride_info.pad_top();
208  const int pool_pad_left = pool_info.pad_stride_info.pad_left();
209  const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
210 
211  int pool_stride_x = 0;
212  int pool_stride_y = 0;
213  std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
214  const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
215  const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
216 
217  const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
218  const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
219  const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
220 
221  const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
222  // "new_offset" doesn't have to consider the "half_scale_v" in its computation
223  // With a requantization performed in a single step there won't be uncertainties introduced
224  const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
225 
226  const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
227  const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
228  const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
229 
230  execute_window_loop(window_out, [&](const Coordinates & id)
231  {
232  const int idx_width = id.y() * pool_stride_x;
233  const int idx_height = id.z() * pool_stride_y;
234  const int pool_limit_y = pool_pad_top - idx_height;
235  const int pool_limit_x = pool_pad_left - idx_width;
236 
237  const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
238  const int pool_end_y = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
239  const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
240  const int pool_end_x = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
241 
242  int x_off = window_start_x;
243  for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
244  {
245  if(pool_info.pool_type != PoolingType::MAX)
246  {
247  q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
248  q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
249  q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
250  q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
251 
252  // Calculate scale
253  const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
254  pool_stride_y);
255 
256  // Perform pooling
257  for(int y = pool_start_y; y < pool_end_y; ++y)
258  {
259  for(int x = pool_start_x; x < pool_end_x; ++x)
260  {
261  const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
262  (src->info()->strides_in_bytes().z())) + x_off);
263 
264  const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
265  const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
266  vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
267  vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
268  vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
269  vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
270  }
271  }
272 
273  if(src_qinfo != dst_qinfo)
274  {
275  const float32x4x4_t vres =
276  {
277  {
278  vcvtq_f32_q32(vres1),
279  vcvtq_f32_q32(vres2),
280  vcvtq_f32_q32(vres3),
281  vcvtq_f32_q32(vres4),
282  }
283  };
284  const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
285  // Store result
286  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
287  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
288  }
289  else
290  {
291  const float32x4_t scale_v = vdupq_n_f32(scale);
292  // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
293  vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
294  vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
295  vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
296  vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
297 
298  const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
299  const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
300  // Store result
301  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
302  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
303  }
304  }
305  else
306  {
307  q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
308 
309  for(int y = pool_start_y; y < pool_end_y; ++y)
310  {
311  for(int x = pool_start_x; x < pool_end_x; ++x)
312  {
313  const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
314  (src->info()->strides_in_bytes().z())) + x_off);
315  vres = wrapper::vmax(vres, data);
316  }
317  }
318 
319  // Store result
320  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
321  requant_qinfo) :
322  vres);
323  }
324  }
325 
326  if(pool_info.pool_type == PoolingType::MAX)
327  {
328  for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
329  {
330  q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
331  for(int y = pool_start_y; y < pool_end_y; ++y)
332  {
333  for(int x = pool_start_x; x < pool_end_x; ++x)
334  {
335  const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
336  (src->info()->strides_in_bytes().z())) + x_off);
337  vres = wrapper::vmax(vres, data);
338  }
339  }
340 
341  // Store result
342  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
343  (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
344  }
345  }
346 
347  // Left-overs loop
348  for(; x_off < window_end_x; ++x_off)
349  {
350  if(pool_info.pool_type != PoolingType::MAX)
351  {
352  q32_t res = static_cast<q32_t>(0.f);
353 
354  // Calculate scale
355  const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
356  pool_stride_y);
357 
358  // Perform pooling
359  for(int y = pool_start_y; y < pool_end_y; ++y)
360  {
361  for(int x = pool_start_x; x < pool_end_x; ++x)
362  {
363  const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
364  (src->info()->strides_in_bytes().z())) + x_off);
365  res += data;
366  }
367  }
368 
369  if(src_qinfo != dst_qinfo)
370  {
371  const float res_f = static_cast<float>(res);
372  const float new_scale = quant_rescale / scale;
373  const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
374 
375  // Store result
376  *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
377  }
378  else
379  {
380  // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
381  res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
382 
383  // Store result
384  *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
385  }
386  }
387  else
388  {
389  T res = std::numeric_limits<T>::min();
390 
391  for(int y = pool_start_y; y < pool_end_y; ++y)
392  {
393  for(int x = pool_start_x; x < pool_end_x; ++x)
394  {
395  const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
396  (src->info()->strides_in_bytes().z())) + x_off);
397  res = std::max(res, data);
398  }
399  }
400 
401  // Store result
402  if(src_qinfo != dst_qinfo)
403  {
404  const float res_f = static_cast<float>(res);
405  *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
406  }
407  else
408  {
409  *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
410  }
411  }
412  }
413 
414  },
415  in, out);
416 }
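
A minimal scalar sketch (illustrative only, not the library's quantize<T>() helper) of the average-pool requantization used in the left-over loop above when the source and destination quantization differ; it folds the division by the pooling area into the requantization scale, matching new_scale = quant_rescale / scale and new_offset in the kernel:

#include <algorithm>
#include <cmath>
#include <cstdint>

// `sum` is the sum of quantized inputs over the valid pooling window and `area`
// its element count. Rounding here is round-half-away-from-zero for simplicity.
uint8_t requantize_avg(int32_t sum, int area,
                       float src_scale, int32_t src_offset,
                       float dst_scale, int32_t dst_offset)
{
    const float   rescale    = dst_scale / src_scale;                      // quant_rescale
    const int32_t new_offset = dst_offset - static_cast<int32_t>(src_offset / rescale);
    const float   new_scale  = rescale * static_cast<float>(area);         // quant_rescale / (1/area)
    const float   q          = std::round(static_cast<float>(sum) / new_scale) + new_offset;
    return static_cast<uint8_t>(std::min(255.f, std::max(0.f, q)));        // saturate to qasymm8
}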

◆ poolingMxN_qasymm8_neon_nhwc()

void poolingMxN_qasymm8_neon_nhwc ( const ITensor *  src0,
ITensor *  dst0,
ITensor *  dst1,
PoolingLayerInfo &  pool_info,
const Window &  window_src,
const Window &  window 
)

Definition at line 36 of file qasymm8.cpp.

References arm_compute::test::validation::src.

37 {
38  poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window);
39 }

◆ poolingMxN_qasymm8_signed_neon_nhwc()

void poolingMxN_qasymm8_signed_neon_nhwc ( const ITensor *  src0,
ITensor *  dst0,
ITensor *  dst1,
PoolingLayerInfo &  pool_info,
const Window &  window_src,
const Window &  window 
)

Definition at line 36 of file qasymm8_signed.cpp.

References arm_compute::test::validation::src.

37 {
38  poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window);
39 }

◆ qasymm8_neon_activation()

void qasymm8_neon_activation ( const ITensor *  src,
ITensor *  dst,
const ActivationLayerInfo &  act_info,
const Window &  window 
)

Definition at line 39 of file qasymm8.cpp.

References ActivationLayerInfo::a(), ActivationLayerInfo::activation(), ARM_COMPUTE_ERROR, arm_compute::test::validation::b, ActivationLayerInfo::b(), ActivationLayerInfo::BOUNDED_RELU, Window::collapse_if_possible(), arm_compute::dequantize_qasymm8(), Window::DimX, Window::DimZ, Window::Dimension::end(), arm_compute::execute_window_loop(), ActivationLayerInfo::HARD_SWISH, ITensor::info(), arm_compute::test::validation::input, ActivationLayerInfo::LEAKY_RELU, ActivationLayerInfo::LOGISTIC, ActivationLayerInfo::LU_BOUNDED_RELU, UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::quantize_qasymm8(), ActivationLayerInfo::RELU, UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), ActivationLayerInfo::TANH, QuantizationInfo::uniform(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vbsl(), arm_compute::wrapper::vcgt(), arm_compute::vdequantize(), arm_compute::wrapper::vdiv(), arm_compute::wrapper::vexpq(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmax(), arm_compute::wrapper::vmin(), arm_compute::vmlaq_qasymm8(), arm_compute::wrapper::vmul(), arm_compute::wrapper::vneg(), arm_compute::vquantize(), arm_compute::wrapper::vstore(), arm_compute::wrapper::vtanh(), and Window::x().

40 {
41  constexpr int window_step_x = 16;
42  const auto window_start_x = static_cast<int>(window.x().start());
43  const auto window_end_x = static_cast<int>(window.x().end());
44  const ActivationLayerInfo::ActivationFunction act = act_info.activation();
45 
46  Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
47  win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
48 
49  Iterator input(src, win_collapsed);
50  Iterator output(dst, win_collapsed);
51 
52  const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform();
53  const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform();
54  const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in));
55  const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in));
56  const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in);
57  const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in);
58  const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in);
59  const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0);
60  const auto vconst_1 = vdupq_n_f32(1.f);
61 #ifndef __aarch64__
62  const auto vconst_0_f32 = vdupq_n_f32(0);
63 #endif // __aarch64__
64  const float32x4_t va_f32 = vdupq_n_f32(act_info.a());
65  const float32x4_t vb_f32 = vdupq_n_f32(act_info.b());
66  const float a_f32 = act_info.a();
67  const float b_f32 = act_info.b();
68  const auto const_6_f32 = vdupq_n_f32(6.f);
69  const auto const_0_f32 = vdupq_n_f32(0.f);
70  const auto const_3_f32 = vdupq_n_f32(3.f);
71  const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f);
72 
73  // Initialise scale/offset for re-quantization
74  float s = qi_in.scale / qi_out.scale;
75  float o = -qi_in.offset * s + qi_out.offset;
76  float32x4_t vs = vdupq_n_f32(s);
77  float32x4_t vo = vdupq_n_f32(o);
78 
79  execute_window_loop(win_collapsed, [&](const Coordinates &)
80  {
81  const auto input_ptr = reinterpret_cast<const qasymm8_t *>(input.ptr());
82  const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
83 
84  wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
85 
86  // Compute S elements per iteration
87  int x = window_start_x;
88  for(; x <= (window_end_x - window_step_x); x += window_step_x)
89  {
90  const auto vin = wrapper::vloadq(input_ptr + x);
91  if(act == ActivationLayerInfo::ActivationFunction::RELU)
92  {
93  // Perform activation
94  tmp = vmaxq_u8(vconst_0, vin);
95  // Re-quantize to new output space
96  tmp = vmlaq_qasymm8(tmp, vs, vo);
97  }
98  else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
99  {
100  // Perform activation
101  tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
102  // Re-quantize to new output space
103  tmp = vmlaq_qasymm8(tmp, vs, vo);
104  }
105  else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
106  {
107  // Perform activation
108  tmp = vminq_u8(va, vmaxq_u8(vb, vin));
109  // Re-quantize to new output space
110  tmp = vmlaq_qasymm8(tmp, vs, vo);
111  }
112  else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
113  {
114  // De-quantize
115  const auto vin_deq = vdequantize(vin, qi_in);
116  // Perform activation
117  const float32x4x4_t tmp_dep =
118  {
119  {
120  wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
121  wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
122  wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
123  wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
124  }
125  };
126  // Re-quantize to new output space
127  tmp = vquantize(tmp_dep, qi_out);
128  }
129  else if(act == ActivationLayerInfo::ActivationFunction::TANH)
130  {
131  // De-quantize
132  const auto vin_deq = vdequantize(vin, qi_in);
133  // Perform activation
134  const float32x4x4_t tmp_dep =
135  {
136  {
137  wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
138  wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
139  wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
140  wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
141  }
142  };
143  // Re-quantize to new output space
144  tmp = vquantize(tmp_dep, qi_out);
145  }
146  else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
147  {
148  // De-quantize
149  const auto vin_deq = vdequantize(vin, qi_in);
150  // Perform activation
151  const float32x4x4_t tmp_dep =
152  {
153  {
154  wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
155  wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
156  wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
157  wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
158  }
159  };
160  // Re-quantize to new output space
161  tmp = vquantize(tmp_dep, qi_out);
162  }
163  else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
164  {
165  const auto vin_deq = vdequantize(vin, qi_in);
166 
167 #ifdef __aarch64__
168  const uint32x4x4_t pos_mask =
169  {
170  {
171  wrapper::vcgtz(vin_deq.val[0]),
172  wrapper::vcgtz(vin_deq.val[1]),
173  wrapper::vcgtz(vin_deq.val[2]),
174  wrapper::vcgtz(vin_deq.val[3]),
175  }
176  };
177 #else // __aarch64__
178  const uint32x4x4_t pos_mask =
179  {
180  {
181  wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
182  wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
183  wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
184  wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
185  }
186  };
187 #endif // __aarch64__
188 
189  const float32x4x4_t tmp_dep =
190  {
191  {
192  wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
193  wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
194  wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
195  wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
196  }
197  };
198 
199  tmp = vquantize(tmp_dep, qi_out);
200  }
201  else
202  {
203  ARM_COMPUTE_ERROR("Unsupported activation function");
204  }
205  wrapper::vstore(output_ptr + x, tmp);
206  }
207 
208  // Compute left-over elements
209  for(; x < window_end_x; ++x)
210  {
211  qasymm8_t in = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
212  qasymm8_t tmp = 0;
213  if(act == ActivationLayerInfo::ActivationFunction::RELU)
214  {
215  tmp = std::max(const_0, in);
216  tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
217  }
218  else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
219  {
220  tmp = std::min(a, std::max(const_0, in));
221  tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
222  }
223  else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
224  {
225  tmp = std::min(a, std::max(b, in));
226  tmp = utility::clamp<int32_t, qasymm8_t>(tmp * s + o);
227  }
228  else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
229  {
230  float tmp_f = dequantize_qasymm8(in, qi_in);
231  tmp_f = 1.f / (1.f + std::exp(-tmp_f));
232  tmp = quantize_qasymm8(tmp_f, qi_out);
233  }
234  else if(act == ActivationLayerInfo::ActivationFunction::TANH)
235  {
236  float tmp_f = dequantize_qasymm8(in, qi_in);
237  tmp_f = a_f32 * std::tanh(b_f32 * tmp_f);
238  tmp = quantize_qasymm8(tmp_f, qi_out);
239  }
240  else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
241  {
242  float tmp_f = dequantize_qasymm8(in, qi_in);
243  tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
244  tmp = quantize_qasymm8(tmp_f, qi_out);
245  }
246  else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
247  {
248  float tmp_f = dequantize_qasymm8(in, qi_in);
249  tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
250  tmp = quantize_qasymm8(tmp_f, qi_out);
251  }
252  else
253  {
254  ARM_COMPUTE_ERROR("Unsupported activation function");
255  }
256  *(output_ptr + x) = tmp;
257  }
258  },
259  input, output);
260 }
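
The scale `s` and offset `o` initialised before the loop implement a one-step requantization from the input to the output quantization space; a minimal scalar sketch (illustrative, not the library's API) shows why q_out = q_in * s + o:

#include <algorithm>
#include <cstdint>

// real value:  x     = (q_in - offset_in) * scale_in
// requantized: q_out = x / scale_out + offset_out
//                    = q_in * s + o, with s = scale_in / scale_out
//                                     and o = -offset_in * s + offset_out
uint8_t requantize_qasymm8(uint8_t q_in, float scale_in, int32_t offset_in,
                           float scale_out, int32_t offset_out)
{
    const float s = scale_in / scale_out;
    const float o = -offset_in * s + offset_out;
    const float q = q_in * s + o;
    return static_cast<uint8_t>(std::min(255.f, std::max(0.f, q))); // saturate to qasymm8
}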