Compute Library
 22.08
arm_compute::cpu Namespace Reference

Namespaces

 kernel
 
 kernels
 

Data Structures

struct  ActFpImplParams
 Constant parameters needed by the activation implementation. More...
 
struct  AsmGemmInfo
 
class  CpuActivation
 Basic function to run kernels::CpuActivationKernel. More...
 
class  CpuAdd
 Basic function to run kernels::CpuAddKernel. More...
 
class  CpuAuxTensorHandler
 
struct  CpuCapabilities
 Structure that encodes the CPU capabilities to be used. More...
 
class  CpuCast
 Basic function to run kernels::CpuCastKernel. More...
 
class  CpuComplexMul
 Basic function to run kernels::CpuComplexMulKernel. More...
 
class  CpuConcatenate
 Basic function to execute concatenate tensors along a given axis. More...
 
class  CpuContext
 CPU context implementation class. More...
 
class  CpuConv2d
 Basic function to simulate a convolution layer. More...
 
class  CpuConvertFullyConnectedWeights
 Basic function to run kernels::CpuConvertFullyConnectedWeightsKernel. More...
 
class  CpuCopy
 Basic function to run kernels::CpuCopyKernel. More...
 
class  CpuDepthwiseConv2d
 Function to execute a depthwise convolution. More...
 
class  CpuDepthwiseConv2dAssemblyDispatch
 Depthwise convolution assembly kernel glue. More...
 
class  CpuDequantize
 Basic function to run kernels::CpuDequantizeKernel that dequantizes an input tensor. More...
 
class  CpuDirectConv2d
 Function to run the direct convolution. More...
 
class  CpuDirectConv3d
 Function to run the direct convolution. More...
 
class  CpuElementwiseArithmetic
 Class to run cpu::kernels::CpuArithmeticKernel except for division and power. More...
 
class  CpuElementwiseBase
 
class  CpuElementwiseComparison
 Basic function to run cpu::kernels::CpuComparisonKernel. More...
 
class  CpuElementwiseComparisonStatic
 Basic function to run cpu::kernels::CpuComparisonKernel. More...
 
class  CpuElementwiseDivision
 Basic function to run cpu::kernels::CpuArithmeticKernel for division. More...
 
class  CpuElementwisePower
 Basic function to run cpu::kernels::CpuArithmeticKernel for power. More...
 
class  CpuElementwiseUnary
 
class  CpuFill
 Basic function to run kernels::CpuFillKernel. More...
 
class  CpuFlatten
 Basic function to flatten a given input. More...
 
class  CpuFloor
 Basic function to run kernels::CpuFloorKernel. More...
 
class  CpuFullyConnected
 Basic function to compute a Fully Connected layer. More...
 
class  CpuGemm
 Basic function to execute GEMM. More...
 
class  CpuGemmAssemblyDispatch
 Assembly kernel glue. More...
 
class  CpuGemmConv2d
 Basic function to compute the convolution layer. More...
 
class  CpuGemmDirectConv2d
 
class  CpuGemmLowpMatrixMultiplyCore
 Basic function to execute GEMMLowpMatrixMultiplyCore. More...
 
class  CpuGemmLowpOutputStage
 Basic function to execute GEMMLowpQuantizeDown kernels. More...
 
class  CpuLogits1DSoftmaxKernel
 
class  CpuMaxUnpooling
 Basic function to run kernels::CpuMaxUnpoolingLayerKernel. More...
 
class  CpuMul
 Basic function to run kernels::CpuMulKernel. More...
 
class  CpuPermute
 Basic function to run kernels::CpuPermuteKernel. More...
 
class  CpuPool2d
 Basic function to simulate a pooling layer with the specified pooling operation. More...
 
class  CpuPool3d
 Basic function to simulate a pooling layer with the specified pooling operation. More...
 
class  CpuQuantize
 Basic function to run kernels::CpuQuantizeKernel that quantizes an input tensor. More...
 
class  CpuQueue
 CPU queue implementation class. More...
 
class  CpuReshape
 Basic function to run kernels::CpuReshapeKernel. More...
 
class  CpuScale
 Basic function to compute Scale. More...
 
class  CpuSoftmaxGeneric
 Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. More...
 
class  CpuSub
 Basic function to run kernels::CpuSubKernel. More...
 
class  CpuTensor
 CPU tensor implementation class. More...
 
class  CpuTranspose
 Basic function to run kernels::CpuTransposeKernel. More...
 
class  CpuWinogradConv2d
 
class  CpuWinogradConv2dTransformInputKernel
 
class  CpuWinogradConv2dTransformOutputKernel
 
class  ICpuKernel
 

Typedefs

using ICpuOperator = experimental::INEOperator
 
using CpuElementwiseMax = CpuElementwiseArithmetic< ArithmeticOperation::MAX >
 Class to run cpu::kernels::CpuArithmeticKernel for the maximum operation. More...
 
using CpuElementwiseMin = CpuElementwiseArithmetic< ArithmeticOperation::MIN >
 Class to run cpu::kernels::CpuArithmeticKernel for the minimum operation. More...
 
using CpuElementwiseSquaredDiff = CpuElementwiseArithmetic< ArithmeticOperation::SQUARED_DIFF >
 Class to run cpu::kernels::CpuArithmeticKernel for the squared difference operation. More...
 
using NEEqual = CpuElementwiseComparisonStatic< ComparisonOperation::Equal >
 Basic function to run equal comparison. More...
 
using NENotEqual = CpuElementwiseComparisonStatic< ComparisonOperation::NotEqual >
 Basic function to run not equal comparison. More...
 
using NEGreater = CpuElementwiseComparisonStatic< ComparisonOperation::Greater >
 Basic function to run greater comparison. More...
 
using NEGreaterEqual = CpuElementwiseComparisonStatic< ComparisonOperation::GreaterEqual >
 Basic function to run greater-equal comparison. More...
 
using NELess = CpuElementwiseComparisonStatic< ComparisonOperation::Less >
 Basic function to run less comparison. More...
 
using NELessEqual = CpuElementwiseComparisonStatic< ComparisonOperation::LessEqual >
 Basic function to run less-equal comparison. More...
 
using KernelType = kernels::CpuElementwiseUnaryKernel
 
using CpuPRelu = CpuElementwiseArithmetic< ArithmeticOperation::PRELU >
 Class to run cpu::kernels::CpuArithmeticKernel for the PRelu operation. More...
 
using CpuSoftmax = CpuSoftmaxGeneric< false >
 
using CpuLogSoftmax = CpuSoftmaxGeneric< true >
 

Enumerations

enum  KernelSelectionType { Preferred, Supported }
 
enum  AsmConvMethod { Im2Col, Indirect, Conv }
 

Functions

void fp16_neon_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 
void fp16_sve_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 
void fp32_neon_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 
void fp32_sve_batch_normalization (ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 
void neon_fp32_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
float32x4_t mask_float_vector (const float32x4_t &in, const uint32x4_t &mask)
 
template<typename T , const ActFpImplParams & P>
void fp_neon_activation_impl (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void neon_qasymm8_activation_lut (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void neon_qasymm8_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void neon_qasymm8_signed_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void neon_qsymm16_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void sve_fp32_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void sve2_qasymm8_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void sve2_qasymm8_signed_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void sve2_qsymm16_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void sve_fp16_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void neon_fp16_activation (const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 
void add_fp32_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_fp32_neon_as_1d_array (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template<typename ScalarType >
void add_same_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template<typename ScalarType >
void add_same_neon_as_1d_array (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_neon< float > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_neon< uint8_t > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_neon< int32_t > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_neon< int16_t > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_neon_as_1d_array< float > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_neon_as_1d_array< uint8_t > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_neon_as_1d_array< int32_t > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_neon_as_1d_array< int16_t > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_u8_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_s16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_s32_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_u8_neon_as_1d_array (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_s16_neon_as_1d_array (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_s32_neon_as_1d_array (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_qasymm8_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_qasymm8_signed_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_qsymm16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_fp32_sve (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template<typename ScalarType >
void add_same_sve (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_sve< float > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_sve< uint8_t > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_sve< int16_t > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template void add_same_sve< int32_t > (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_u8_sve (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_s16_sve (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_s32_sve (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_qasymm8_sve2 (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_qasymm8_signed_sve2 (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_qsymm16_sve2 (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_fp16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_fp16_neon_as_1d_array (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void add_fp16_sve (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void neon_fp32_boundingboxtransform (const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
 
void bounding_box_transform_qsymm16 (const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
 
template<typename T >
void bounding_box_transform (const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
 
template void bounding_box_transform< float > (const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
 
void neon_qu16_boundingboxtransform (const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
 
void neon_fp16_boundingboxtransform (const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
 
void neon_fp32_to_fp16_cast (const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
 
void neon_u8_to_fp16_cast (const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
 
void neon_fp16_to_other_dt_cast (const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
 
void neon_s32_to_fp16_cast (const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
 
void neon_qasymm8_signed_to_fp16_cast (const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
 
void neon_fp32_to_bfloat16_cast (const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
 
void neon_bfloat16_to_fp32_cast (const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
 
template<typename T >
void directconv3d_float_neon_ndhwc (const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window)
 
template<typename T >
void directconv3d_quantized_neon_ndhwc (const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window)
 
template<typename T >
float32x4_t load_as_f32 (T *ptr)
 
template<>
float32x4_t load_as_f32 (float *ptr)
 
template<>
float32x4_t load_as_f32 (int32_t *ptr)
 
template<>
float32x4_t load_as_f32 (uint32_t *ptr)
 
template<>
float32x4_t load_as_f32 (int16_t *ptr)
 
template<>
float32x4_t load_as_f32 (uint16_t *ptr)
 
template<>
float32x4_t load_as_f32 (uint8_t *ptr)
 
void fp32_in_bounds_crop_window (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
template<typename T >
void in_bounds_crop_window (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
template void in_bounds_crop_window< float32_t > (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
template void in_bounds_crop_window< uint8_t > (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
template void in_bounds_crop_window< uint16_t > (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
template void in_bounds_crop_window< uint32_t > (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
template void in_bounds_crop_window< int8_t > (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
template void in_bounds_crop_window< int16_t > (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
template void in_bounds_crop_window< int32_t > (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
void u8_in_bounds_crop_window (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
void u16_in_bounds_crop_window (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
void u32_in_bounds_crop_window (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
void s8_in_bounds_crop_window (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
void s16_in_bounds_crop_window (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
void s32_in_bounds_crop_window (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
void fp16_in_bounds_crop_window (const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
 
void neon_fp32_deptwiseconv2dnative (const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
template<typename T , typename TW >
void run_depthwise_float (const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
template void run_depthwise_float< float, float > (const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
template<typename T , typename TW >
void run_depthwise_quanitized8bit (const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
template void run_depthwise_quanitized8bit< uint8_t, uint8_t > (const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
template void run_depthwise_quanitized8bit< int8_t, int8_t > (const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
template void run_depthwise_quanitized8bit< uint8_t, int8_t > (const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
void neon_qu8_deptwiseconv2dnative (const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
void neon_qp8_qu8_deptwiseconv2dnative (const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
void neon_qs8_deptwiseconv2dnative (const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
void neon_qp8_qs8_deptwiseconv2dnative (const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
void neon_fp16_deptwiseconv2dnative (const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
 
template<ArithmeticOperation op>
void neon_fp32_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_elementwise_binary< ArithmeticOperation::ADD > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_elementwise_binary< ArithmeticOperation::SUB > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_elementwise_binary< ArithmeticOperation::DIV > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_elementwise_binary< ArithmeticOperation::MIN > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_elementwise_binary< ArithmeticOperation::MAX > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_elementwise_binary< ArithmeticOperation::SQUARED_DIFF > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_elementwise_binary< ArithmeticOperation::POWER > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_elementwise_binary< ArithmeticOperation::PRELU > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void neon_fp32_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_fp32_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op, typename VectorType >
VectorType::type elementwise_arithm_op (const typename VectorType::type &a, const typename VectorType::type &b)
 
template<ArithmeticOperation op, typename ScalarType , typename VectorType >
VectorType::type elementwise_arithm_op_broadcast (const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
 
template<typename InputScalarType , typename OutputScalarType , typename InputVectorType >
void elementwise_op (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, OutputScalarType(*scalar_func)(const InputScalarType &, const InputScalarType &), int(*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool), int(*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
 
template<ArithmeticOperation op, typename ScalarType >
ScalarType elementwise_arithm_op_scalar (const ScalarType &a, const ScalarType &b)
 
template<>
int32x4_t elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< int32_t, 4 > > (const int32x4_t &a, const int32x4_t &b)
 
template<>
float32x4_t elementwise_arithm_op< ArithmeticOperation::DIV, typename wrapper::traits::neon_vector< float, 4 > > (const float32x4_t &a, const float32x4_t &b)
 
template<>
float32x4_t elementwise_arithm_op< ArithmeticOperation::POWER, typename wrapper::traits::neon_vector< float, 4 > > (const float32x4_t &a, const float32x4_t &b)
 
template<ArithmeticOperation op, typename ScalarType , typename VectorType >
int elementwise_arithm_op_loop (int window_start_x, int window_end_x, int window_step_x, const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
 
template<ArithmeticOperation op, typename ScalarType , typename VectorType >
int elementwise_arithm_op_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
 
template<ArithmeticOperation op, typename VectorType >
void elementwise_arithm_op (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op, typename InputScalarType >
uint8_t elementwise_comp_op_scalar (const InputScalarType &a, const InputScalarType &b)
 
template<ComparisonOperation op, typename InputVectorType , typename OutputVectorType >
OutputVectorType elementwise_comp_op (const InputVectorType &a, const InputVectorType &b)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType , typename OutputVectorType >
OutputVectorType elementwise_comp_op_broadcast (const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_broadcast_8_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_broadcast_16_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_broadcast_32_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_8_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_16_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
int elementwise_comp_op_32_loop (int window_start_x, int window_end_x, int window_step_x, const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
void elementwise_comp_op_8 (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
void elementwise_comp_op_16 (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op, typename InputScalarType , typename InputVectorType >
void elementwise_comp_op_32 (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
float32x4x4_t load_quantized (const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
 
float32x4x4_t load_quantized_signed (const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
 
void store_quantized (uint8_t *output_ptr, const uint32x4x4_t &out)
 
void store_quantized (uint8_t *output_ptr, const int32x4x4_t &out)
 
void store_quantized (uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
 
void store_quantized_signed (int8_t *output_ptr, const int32x4x4_t &out)
 
void store_quantized_signed (int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
 
template<ArithmeticOperation op>
uint8_t elementwise_arithm_op_quantized_scalar (const float &a, const float &b, UniformQuantizationInfo qinfo)
 
template<ArithmeticOperation op>
int8_t elementwise_arithm_op_quantized_signed_scalar (const float &a, const float &b, UniformQuantizationInfo qinfo)
 
template<ArithmeticOperation op>
float32x4x4_t elementwise_arithm_op (const float32x4x4_t &a, const float32x4x4_t &b)
 
template<ComparisonOperation op>
uint8_t elementwise_comp_op_quantized_scalar (const float &a, const float &b, UniformQuantizationInfo qinfo)
 
template<ComparisonOperation op>
uint32x4x4_t elementwise_comp_op (const float32x4x4_t &a, const float32x4x4_t &b)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_singed_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
template<ArithmeticOperation op>
int elementwise_arithm_op_quantized_signed_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_signed_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr, int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2, float32x4_t voffseto, float32x4_t invvscaleo)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
template<ComparisonOperation op>
int elementwise_comp_op_quantized_signed_broadcast_loop (int window_start_x, int window_end_x, int window_step_x, const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
 
void elementwise_op_quantized (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, uint8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
 
void elementwise_comp_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, uint8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
 
void elementwise_op_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int8_t(*scalar_func)(const float &, const float &, UniformQuantizationInfo), int(*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t, float32x4_t, float32x4_t, const bool), int(*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *, int32x4_t, int32x4_t, float32x4_t, float32x4_t, float32x4_t, float32x4_t))
 
template<ArithmeticOperation op>
void elementwise_arithm_op_quantized (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void elementwise_arithm_op_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void elementwise_comp_op_quantized (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void elementwise_comp_op_quantized_signed (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void neon_s32_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_elementwise_binary< ArithmeticOperation::ADD > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_elementwise_binary< ArithmeticOperation::SUB > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_elementwise_binary< ArithmeticOperation::DIV > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_elementwise_binary< ArithmeticOperation::MIN > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_elementwise_binary< ArithmeticOperation::MAX > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_elementwise_binary< ArithmeticOperation::SQUARED_DIFF > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_elementwise_binary< ArithmeticOperation::POWER > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_elementwise_binary< ArithmeticOperation::PRELU > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void neon_s16_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_elementwise_binary< ArithmeticOperation::ADD > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_elementwise_binary< ArithmeticOperation::SUB > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_elementwise_binary< ArithmeticOperation::DIV > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_elementwise_binary< ArithmeticOperation::MIN > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_elementwise_binary< ArithmeticOperation::MAX > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_elementwise_binary< ArithmeticOperation::SQUARED_DIFF > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_elementwise_binary< ArithmeticOperation::POWER > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_elementwise_binary< ArithmeticOperation::PRELU > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void neon_u8_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_u8_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_u8_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_u8_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_u8_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_u8_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_u8_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void neon_s16_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s16_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void neon_s32_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_s32_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void neon_qasymm8_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_elementwise_binary< ArithmeticOperation::ADD > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_elementwise_binary< ArithmeticOperation::SUB > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_elementwise_binary< ArithmeticOperation::DIV > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_elementwise_binary< ArithmeticOperation::MIN > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_elementwise_binary< ArithmeticOperation::MAX > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_elementwise_binary< ArithmeticOperation::SQUARED_DIFF > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_elementwise_binary< ArithmeticOperation::POWER > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_elementwise_binary< ArithmeticOperation::PRELU > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void neon_qasymm8_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void neon_qasymm8_signed_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_elementwise_binary< ArithmeticOperation::ADD > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_elementwise_binary< ArithmeticOperation::SUB > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_elementwise_binary< ArithmeticOperation::DIV > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_elementwise_binary< ArithmeticOperation::MIN > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_elementwise_binary< ArithmeticOperation::MAX > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_elementwise_binary< ArithmeticOperation::SQUARED_DIFF > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_elementwise_binary< ArithmeticOperation::POWER > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_elementwise_binary< ArithmeticOperation::PRELU > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void neon_qasymm8_signed_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void neon_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void sve_fp32_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_elementwise_binary< ArithmeticOperation::ADD > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_elementwise_binary< ArithmeticOperation::SUB > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_elementwise_binary< ArithmeticOperation::DIV > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_elementwise_binary< ArithmeticOperation::MIN > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_elementwise_binary< ArithmeticOperation::MAX > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_elementwise_binary< ArithmeticOperation::SQUARED_DIFF > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_elementwise_binary< ArithmeticOperation::POWER > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_elementwise_binary< ArithmeticOperation::PRELU > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void sve_fp32_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_fp32_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<typename ScalarType >
void elementwise_arithmetic_op (const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
 
template void elementwise_arithmetic_op< float32_t > (const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window)
 
template void elementwise_arithmetic_op< float16_t > (const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window)
 
template void elementwise_arithmetic_op< int16_t > (const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window)
 
template void elementwise_arithmetic_op< int32_t > (const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window)
 
template<typename InputScalarType , typename OutputScalarType >
void elementwise_comparison_op (const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
 
template void elementwise_comparison_op< float32_t > (const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window)
 
template void elementwise_comparison_op< float16_t > (const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window)
 
template void elementwise_comparison_op< uint8_t > (const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window)
 
template void elementwise_comparison_op< int16_t > (const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window)
 
template void elementwise_comparison_op< int32_t > (const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window)
 
template<>
svint32_t elementwise_pow< svint32_t > (svbool_t &pg, const svint32_t &a, const svint32_t &b)
 
template<>
svint32_t elementwise_div< svint32_t > (svbool_t &pg, const svint32_t &a, const svint32_t &b)
 
template<>
svint16_t elementwise_div< svint16_t > (svbool_t &pg, const svint16_t &a, const svint16_t &b)
 
template<typename VectorType >
VectorType elementwise_pow (svbool_t &pg, const VectorType &a, const VectorType &b)
 
template<typename VectorType >
VectorType elementwise_div (svbool_t &pg, const VectorType &a, const VectorType &b)
 
template<uint32_t bytewidth>
svbool_t narrow_to_byte_predicate (svbool_t pg)
 
template<typename VectorType >
VectorType elementwise_arithmetic_op (svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op)
 
template<typename InputVectorType , typename OutputVectorType >
OutputVectorType elementwise_comparison_op (svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
 
template<ArithmeticOperation op>
void sve_s32_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_elementwise_binary< ArithmeticOperation::ADD > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_elementwise_binary< ArithmeticOperation::SUB > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_elementwise_binary< ArithmeticOperation::DIV > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_elementwise_binary< ArithmeticOperation::MIN > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_elementwise_binary< ArithmeticOperation::MAX > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_elementwise_binary< ArithmeticOperation::SQUARED_DIFF > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_elementwise_binary< ArithmeticOperation::POWER > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_elementwise_binary< ArithmeticOperation::PRELU > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void sve_s16_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_elementwise_binary< ArithmeticOperation::ADD > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_elementwise_binary< ArithmeticOperation::SUB > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_elementwise_binary< ArithmeticOperation::DIV > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_elementwise_binary< ArithmeticOperation::MIN > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_elementwise_binary< ArithmeticOperation::MAX > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_elementwise_binary< ArithmeticOperation::SQUARED_DIFF > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_elementwise_binary< ArithmeticOperation::POWER > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_elementwise_binary< ArithmeticOperation::PRELU > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void sve_u8_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_u8_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_u8_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_u8_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_u8_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_u8_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_u8_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void sve_s16_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s16_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void sve_s32_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve_s32_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
svfloat32x4_t load_quantized (const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
 
svfloat32x4_t load_quantized (const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
 
void store_quantized (uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
 
void store_quantized (int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
 
template<typename ScalarType >
void elementwise_arithmetic_quantized_op (const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
 
template<typename InputScalarType , typename OutputScalarType = uint8_t>
void elementwise_comparison_quantized_op (const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
 
template<ArithmeticOperation op>
void sve2_qasymm8_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_elementwise_binary< ArithmeticOperation::ADD > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_elementwise_binary< ArithmeticOperation::SUB > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_elementwise_binary< ArithmeticOperation::DIV > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_elementwise_binary< ArithmeticOperation::MIN > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_elementwise_binary< ArithmeticOperation::MAX > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_elementwise_binary< ArithmeticOperation::SQUARED_DIFF > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_elementwise_binary< ArithmeticOperation::POWER > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_elementwise_binary< ArithmeticOperation::PRELU > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void sve2_qasymm8_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void sve2_qasymm8_signed_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_elementwise_binary< ArithmeticOperation::ADD > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_elementwise_binary< ArithmeticOperation::SUB > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_elementwise_binary< ArithmeticOperation::DIV > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_elementwise_binary< ArithmeticOperation::MIN > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_elementwise_binary< ArithmeticOperation::MAX > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_elementwise_binary< ArithmeticOperation::SQUARED_DIFF > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_elementwise_binary< ArithmeticOperation::POWER > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_elementwise_binary< ArithmeticOperation::PRELU > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void sve2_qasymm8_signed_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::Equal > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::NotEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::Greater > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::GreaterEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::Less > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void sve2_qasymm8_signed_comparison_elementwise_binary< ComparisonOperation::LessEqual > (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void sve_fp16_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ArithmeticOperation op>
void neon_fp16_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void sve_fp16_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<ComparisonOperation op>
void neon_fp16_comparison_elementwise_binary (const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
void neon_fp32_elementwise_unary (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
template<typename ScalarType >
ScalarType elementwise_op_scalar_imp (ElementWiseUnary op, const ScalarType &a)
 
template<typename ScalarType , typename VectorType >
VectorType elementwise_op_imp (ElementWiseUnary op, const VectorType &a)
 
template<typename ScalarType >
void elementwise_op (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
template void elementwise_op< float > (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
template void elementwise_op< int32_t > (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
void neon_s32_elementwise_unary (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
void sve_fp32_elementwise_unary (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
template<typename ScalarType , typename VectorType >
std::enable_if< utils::traits::is_floating_point< ScalarType >::value, VectorType >::type elementwise_op_sve_imp (svbool_t pg, ElementWiseUnary op, const VectorType &a)
 
template<typename ScalarType , typename VectorType >
std::enable_if< std::is_integral< ScalarType >::value, VectorType >::type elementwise_op_sve_imp (svbool_t pg, ElementWiseUnary op, const VectorType &a)
 
template<typename ScalarType >
void elementwise_sve_op (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
template void elementwise_sve_op< float16_t > (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
template void elementwise_sve_op< float32_t > (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
template void elementwise_sve_op< int32_t > (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
void sve_s32_elementwise_unary (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
void sve_fp16_elementwise_unary (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
void neon_fp16_elementwise_unary (const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
 
void fp16_neon_floor (const void *src, void *dst, int len)
 
void fp32_neon_floor (const void *src, void *dst, int len)
 
void fused_batch_normalization_conv_f32 (const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
 
template<typename T >
void fused_batch_normalization_conv (const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
 
template void fused_batch_normalization_conv< float32_t > (const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
 
void fused_batch_normalization_conv_f16 (const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
 
void fused_batch_normalization_dwc_nhwc_f16 (const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
 
void fused_batch_normalization_dwc_nhwc_f32 (const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
 
void fused_batch_normalization_dwc_nchw_f16 (const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
 
void fused_batch_normalization_dwc_nchw_f32 (const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
 
template<typename T >
void fused_batch_normalization_dwc_nchw (const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
 
template<typename T >
void fused_batch_normalization_dwc_nhwc (const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
 
template void fused_batch_normalization_dwc_nhwc< float32_t > (const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
 
void neon_fp32_gemm_matrix_add (const ITensor *src, ITensor *dst, const Window &window, float beta)
 
void matrix_addition_f32 (const ITensor *src, ITensor *dst, const Window &window, float beta)
 
void neon_fp16_gemm_matrix_add (const ITensor *src, ITensor *dst, const Window &window, float beta)
 
void neon_fp32_gemm_matrix_mul (const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector)
 
void vector_matrix_multiply_f32 (const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
 
void matrix_matrix_multiply_f32 (const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
 
void neon_fp16_gemm_matrix_mul (const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector)
 
void neon_fp32_computeallanchors (const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
 
template<typename T >
void compute_all_anchors (const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
 
template void compute_all_anchors< float > (const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
 
void compute_all_anchors_qasymm16 (const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
 
void neon_qu16_computeallanchors (const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
 
void neon_fp16_computeallanchors (const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
 
void neon_fp32_instancenorm (ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
 
template<typename InputType , typename AccType >
void vector_float_sum (AccType &result, AccType &result_square, const InputType &inputs)
 
template<typename InputType , typename AccType >
InputType vector_float_norm (const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
 
template<typename T , typename AccType >
void instance_normalization_nchw (ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
 
template void instance_normalization_nchw< float > (ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
 
void neon_fp16_instancenorm (ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
 
void neon_fp32_l2_normalize_x (const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
 
void neon_fp32_l2_normalize_yz (const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
 
template<typename T , int S>
void l2_normalize_x (const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
 
template<typename T , int S>
void l2_normalize_yz (const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
 
template void l2_normalize_yz< float, 4 > (const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
 
template void l2_normalize_x< float, 4 > (const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window)
 
void neon_fp16_l2_normalize_x (const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
 
void neon_fp16_l2_normalize_yz (const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
 
void neon_fp32_maxunpooling (const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
 
template<typename T >
void max_unpooling (const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
 
template void max_unpooling< float > (const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
 
template void max_unpooling< int8_t > (const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
 
template void max_unpooling< uint8_t > (const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
 
void neon_qs8_maxunpooling (const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
 
void neon_qu8_maxunpooling (const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
 
void neon_fp16_maxunpooling (const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)
 
void neon_fp32_meanstddevnorm (ITensor *input, ITensor *output, float epsilon, const Window &window)
 
template<typename ScalarType , int size>
void mean_stddev_normalization (ITensor *input, ITensor *output, float epsilon, const Window &window)
 
template void mean_stddev_normalization< float, 4 > (ITensor *input, ITensor *output, float epsilon, const Window &window)
 
void neon_fp16_meanstddevnorm (ITensor *input, ITensor *output, float epsilon, const Window &window)
 
void poolingMxN_fp32_neon_nhwc (const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
 
void poolingMxN_qasymm8_neon_nhwc (const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
 
void poolingMxN_qasymm8_signed_neon_nhwc (const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
 
void poolingMxN_fp16_neon_nhwc (const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
 
template<typename T >
uint32_t offset_no_padding (uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout)
 
template<typename T >
void poolingMxN_q8_neon_nhwc (const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
 
void neon_q8_pool3d (const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window)
 
void neon_q8_signed_pool3d (const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window)
 
void neon_fp16_pool3d (const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window)
 
void neon_fp32_pool3d (const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window)
 
template<typename T >
void poolingMxNxD_fp_neon_ndhwc (const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
 
template<typename T >
void poolingMxNxD_q8_neon_ndhwc (const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
 
template void poolingMxNxD_fp_neon_ndhwc< float > (const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
 
template void poolingMxNxD_q8_neon_ndhwc< uint8_t > (const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
 
template void poolingMxNxD_q8_neon_ndhwc< int8_t > (const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
 
template<typename T >
void avg_poolingMxNxD_q8_neon_ndhwc (const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x)
 
template<typename T >
void max_poolingMxNxD_q8_neon_ndhwc (const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x)
 
void fp32_neon_range_function (ITensor *output, float start, float step, const Window &window)
 
template<typename T >
void neon_range_function (ITensor *output, float start, float step, const Window &window)
 
template void neon_range_function< uint8_t > (ITensor *output, float start, float step, const Window &window)
 
template void neon_range_function< uint16_t > (ITensor *output, float start, float step, const Window &window)
 
template void neon_range_function< uint32_t > (ITensor *output, float start, float step, const Window &window)
 
template void neon_range_function< int8_t > (ITensor *output, float start, float step, const Window &window)
 
template void neon_range_function< int16_t > (ITensor *output, float start, float step, const Window &window)
 
template void neon_range_function< int32_t > (ITensor *output, float start, float step, const Window &window)
 
template void neon_range_function< float32_t > (ITensor *output, float start, float step, const Window &window)
 
void u8_neon_range_function (ITensor *output, float start, float step, const Window &window)
 
void u16_neon_range_function (ITensor *output, float start, float step, const Window &window)
 
void u32_neon_range_function (ITensor *output, float start, float step, const Window &window)
 
void s8_neon_range_function (ITensor *output, float start, float step, const Window &window)
 
void s16_neon_range_function (ITensor *output, float start, float step, const Window &window)
 
void s32_neon_range_function (ITensor *output, float start, float step, const Window &window)
 
void fp16_neon_range_function (ITensor *output, float start, float step, const Window &window)
 
void neon_fp32_roialign (const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
 
template<typename input_data_type >
input_data_type roi_align_1x1 (const ITensor *input, unsigned int roi_batch, float region_start_x, float bin_size_x, int grid_size_x, float region_end_x, float region_start_y, float bin_size_y, int grid_size_y, float region_end_y, int pz)
 Average pooling over an aligned window. More...
 
template<typename input_data_type >
input_data_type roi_align_1x1_qasymm8 (const ITensor *input, unsigned int roi_batch, float region_start_x, float bin_size_x, int grid_size_x, float region_end_x, float region_start_y, float bin_size_y, int grid_size_y, float region_end_y, int pz, const QuantizationInfo &out_qinfo)
 Average pooling over an aligned window. More...
 
float compute_region_coordinate (int p, float bin_size, float roi_anchor, float max_value)
 
template<typename input_data_type , typename roi_data_type >
void roi_align (const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
 
template void roi_align< float, float > (const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
 
template void roi_align< uint8_t, uint16_t > (const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
 
template void roi_align< int8_t, uint16_t > (const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
 
void neon_qu8_roialign (const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
 
void neon_qs8_roialign (const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
 
void neon_fp16_roialign (const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
 
void u8_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void s16_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_signed_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
template<typename T >
void nearest_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset, bool align_corners, const Window &window)
 
template<typename T >
void bilinear_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
template<typename T >
void common_neon_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void fp32_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void u8_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void s16_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void fp16_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void qasymm8_signed_sve_scale (const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
 
void neon_f32_select_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_f32_select_not_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
template<typename ScalarType , typename VectorType >
void select_op (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType(*condition_conversion)(const uint8_t *))
 
template<typename ScalarType , typename VectorType >
void select_op_8 (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<typename ScalarType , typename VectorType >
void select_op_16 (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<typename ScalarType , typename VectorType >
void select_op_32 (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template<typename ScalarType >
void select_op_not_same_rank (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_32< float, uint32x4_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_not_same_rank< float > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_8< int8_t, uint8x16_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_16< int16_t, uint16x8_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_32< int32_t, uint32x4_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_not_same_rank< int8_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_not_same_rank< int16_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_not_same_rank< int32_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_8< uint8_t, uint8x16_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_16< uint16_t, uint16x8_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_32< uint32_t, uint32x4_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_not_same_rank< uint8_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_not_same_rank< uint16_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
template void select_op_not_same_rank< uint32_t > (const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 
void neon_s8_select_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_s16_select_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_s32_select_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_s8_select_not_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_s16_select_not_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_s32_select_not_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_u8_select_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_u16_select_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_u32_select_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_u8_select_not_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_u16_select_not_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_u32_select_not_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_f16_select_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_f16_select_not_same_rank (const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 
void neon_fp32_softmax (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void neon_fp32_logits (const ITensor *in, ITensor *out, const Window &window)
 
template<typename T >
void neon_logits_1d_max (const ITensor *in, ITensor *out, const Window &window)
 
template void neon_logits_1d_max< float > (const ITensor *in, ITensor *out, const Window &window)
 
template void neon_logits_1d_max< qasymm8_signed_t > (const ITensor *in, ITensor *out, const Window &window)
 
template void neon_logits_1d_max< qasymm8_t > (const ITensor *in, ITensor *out, const Window &window)
 
template<typename T >
void neon_softmax_logits_1d_quantized (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
 
template void neon_softmax_logits_1d_quantized< qasymm8_signed_t > (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
 
template void neon_softmax_logits_1d_quantized< qasymm8_t > (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
 
template<typename T >
void neon_softmax_logits_1d_float (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
template void neon_softmax_logits_1d_float< float > (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void neon_qasymm8_softmax (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void neon_qasymm8_logits (const ITensor *in, ITensor *out, const Window &window)
 
void neon_qasymm8_signed_softmax (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void neon_qasymm8_singed_logits (const ITensor *in, ITensor *out, const Window &window)
 
void sve_fp32_softmax (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void sve_fp32_logits (const ITensor *in, ITensor *out, const Window &window)
 
template<typename ScalarType >
void sve_logits_1d_max (const ITensor *in, ITensor *out, const Window &window)
 
template<typename ScalarType >
void sve_softmax_logits_1d_float (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
template void sve_logits_1d_max< float > (const ITensor *in, ITensor *out, const Window &window)
 
template void sve_logits_1d_max< float16_t > (const ITensor *in, ITensor *out, const Window &window)
 
template void sve_logits_1d_max< qasymm8_t > (const ITensor *in, ITensor *out, const Window &window)
 
template void sve_logits_1d_max< qasymm8_signed_t > (const ITensor *in, ITensor *out, const Window &window)
 
template void sve_softmax_logits_1d_float< float > (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
template void sve_softmax_logits_1d_float< float16_t > (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void sve_qasymm8_logits (const ITensor *in, ITensor *out, const Window &window)
 
void sve_qasymm8_signed_logits (const ITensor *in, ITensor *out, const Window &window)
 
template<typename ScalarType >
void sve2_softmax_logits_1d_quantized (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
 
template void sve2_softmax_logits_1d_quantized< qasymm8_signed_t > (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
 
template void sve2_softmax_logits_1d_quantized< qasymm8_t > (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
 
void sve2_qasymm8_softmax (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void sve2_qasymm8_signed_softmax (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void neon_fp16_softmax (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void sve_fp16_softmax (const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, bool is_log, const Window &window)
 
void neon_fp16_logits (const ITensor *in, ITensor *out, const Window &window)
 
void sve_fp16_logits (const ITensor *in, ITensor *out, const Window &window)
 
void sub_qasymm8_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void sub_qasymm8_signed_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
void sub_qsymm16_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 
template<typename T >
void sub_same_neon (const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 

Variables

constexpr int step = 4
 

Typedef Documentation

◆ CpuElementwiseMax

Class to run cpu::kernels::CpuArithmeticKernel for the maximum operation.

Definition at line 65 of file CpuElementwise.h.

◆ CpuElementwiseMin

Class to run cpu::kernels::CpuArithmeticKernel for the minimum operation.

Definition at line 67 of file CpuElementwise.h.

◆ CpuElementwiseSquaredDiff

Class to run cpu::kernels::CpuArithmeticKernel for the squared difference operation.

Definition at line 69 of file CpuElementwise.h.

◆ CpuLogSoftmax

Definition at line 107 of file CpuSoftmax.h.

◆ CpuPRelu

Class to run cpu::kernels::CpuArithmeticKernel for the PRelu operation.

Definition at line 34 of file CpuPRelu.h.

◆ CpuSoftmax

using CpuSoftmax = CpuSoftmaxGeneric<false>

Definition at line 106 of file CpuSoftmax.h.

◆ ICpuOperator

Definition at line 33 of file ICpuOperator.h.

◆ KernelType

◆ NEEqual

Basic function to run equal comparison.

Definition at line 171 of file CpuElementwise.h.

◆ NEGreater

Basic function to run greater comparison.

Definition at line 175 of file CpuElementwise.h.

◆ NEGreaterEqual

Basic function to run greater-equal comparison.

Definition at line 177 of file CpuElementwise.h.

◆ NELess

Basic function to run less comparison.

Definition at line 179 of file CpuElementwise.h.

◆ NELessEqual

Basic function to run less-equal comparison.

Definition at line 181 of file CpuElementwise.h.

◆ NENotEqual

Basic function to run not equal comparison.

Definition at line 173 of file CpuElementwise.h.

Enumeration Type Documentation

◆ AsmConvMethod

◆ KernelSelectionType

enum KernelSelectionType
strong
Enumerator
Preferred 

Retrieve the best implementation available for the given Cpu ISA, ignoring the build flags.

Supported 

Retrieve the best implementation available for the given Cpu ISA that is supported by the current build.

Definition at line 34 of file ICpuKernel.h.

35 {
36  Preferred, /**< Retrieve the best implementation available for the given Cpu ISA, ignoring the build flags */
37  Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */
38 };
Retrieve the best implementation available for the given Cpu ISA, ignoring the build flags.
Retrieve the best implementation available for the given Cpu ISA that is supported by the current build.

Function Documentation

◆ add_fp16_neon()

void arm_compute::cpu::add_fp16_neon ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

◆ add_fp16_neon_as_1d_array()

void arm_compute::cpu::add_fp16_neon_as_1d_array ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

◆ add_fp16_sve()

void arm_compute::cpu::add_fp16_sve ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

◆ add_fp32_neon()

void add_fp32_neon ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 31 of file fp32.cpp.

References add_same_neon< float >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

32 {
33  return add_same_neon<float>(src0, src1, dst, policy, window);
34 }
template void add_same_neon< float >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ add_fp32_neon_as_1d_array()

void add_fp32_neon_as_1d_array ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 36 of file fp32.cpp.

References add_same_neon_as_1d_array< float >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

37 {
38  return add_same_neon_as_1d_array<float>(src0, src1, dst, policy, window);
39 }
template void add_same_neon_as_1d_array< float >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ add_fp32_sve()

void add_fp32_sve ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 33 of file fp32.cpp.

References add_same_sve< float >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

34 {
35  return add_same_sve<float>(src0, src1, dst, policy, window);
36 }
template void add_same_sve< float >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ add_qasymm8_neon()

void add_qasymm8_neon ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 35 of file qasymm8.cpp.

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::quantize_qasymm8(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

36 {
37  ARM_COMPUTE_UNUSED(policy);
38 
39  // Create input windows
40  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
42 
43  // Clear X Dimension on execution window as we handle manually
44  Window win = window;
45  win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  const int window_step_x = 16;
48  const auto window_start_x = static_cast<int>(window.x().start());
49  const auto window_end_x = static_cast<int>(window.x().end());
50  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
51 
52  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
53  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
54  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
55 
56  const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
57  const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
58 
59  if(is_broadcast_across_x)
60  {
61  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
62  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
63  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
64  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
65  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
66  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
67  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
68 
69  const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
70  const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
71  const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
72  const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
73 
74  // Clear X Dimension on execution window as we handle manually
75  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
76 
77  Iterator broadcast_input(broadcast_tensor, broadcast_win);
78  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
79  Iterator output(dst, win);
80 
81  execute_window_loop(win, [&](const Coordinates &)
82  {
83  const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
84  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
85 
86  const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
87  const uint8x16_t broadcast_value_vec = vdupq_n_u8(broadcast_value);
88 
89  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
90  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(broadcast_value_vec))))), voffset2)), vscale2);
91  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);
92  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(broadcast_value_vec))))), voffset2)), vscale2);
93 
94  const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
95 
96  // Compute S elements per iteration
97  int x = window_start_x;
98  for(; x <= (window_end_x - window_step_x); x += window_step_x)
99  {
100  const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
101  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
102  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
103  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
104  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
105 
106  int32x4_t rf_0{};
107  int32x4_t rf_1{};
108  int32x4_t rf_2{};
109  int32x4_t rf_3{};
110 
111 #ifdef __aarch64__
112  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
113  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
114  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
115  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
116 #else //__aarch64__
117  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
118  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
119  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
120  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
121 #endif //__aarch64__
122 
123  const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
124  const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
125  vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
126  }
127 
128  // Compute left-over elements
129  for(; x < window_end_x; ++x)
130  {
131  const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
132  *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
133  }
134  },
135  broadcast_input, non_broadcast_input, output);
136  }
137  else
138  {
139  // Clear X Dimension on execution window as we handle manually
140  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
141  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
142 
143  Iterator input1(src0, input1_win);
144  Iterator input2(src1, input2_win);
145  Iterator output(dst, win);
146 
147  const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
148  const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
149  const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
150  const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
151 
152  execute_window_loop(win, [&](const Coordinates &)
153  {
154  const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
155  const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
156  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
157 
158  // Compute S elements per iteration
159  int x = window_start_x;
160  for(; x <= (window_end_x - window_step_x); x += window_step_x)
161  {
162  const uint8x16_t a = vld1q_u8(input1_ptr + x);
163  const uint8x16_t b = vld1q_u8(input2_ptr + x);
164 
165  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
166  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(a))))), voffset1)), vscale1);
167  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
168  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(a))))), voffset1)), vscale1);
169 
170  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
171  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(b))))), voffset2)), vscale2);
172  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);
173  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(b))))), voffset2)), vscale2);
174 
175  int32x4_t rf_0{};
176  int32x4_t rf_1{};
177  int32x4_t rf_2{};
178  int32x4_t rf_3{};
179 
180 #ifdef __aarch64__
181  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
182  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
183  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
184  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
185 #else //__aarch64__
186  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
187  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
188  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
189  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
190 #endif //__aarch64__
191 
192  const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
193  const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
194  vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
195  }
196 
197  // Compute left-over elements
198  for(; x < window_end_x; ++x)
199  {
200  const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
201  const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
202  *(output_ptr + x) = quantize_qasymm8((afs + bfs), oq_info);
203  }
204  },
205  input1, input2, output);
206  }
207 }
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:1084
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
uchar quantize_qasymm8(float input, float offset, float scale)
Quantize a floating-point scalar value to 8-bit asymmetric.
Definition: helpers_asymm.h:47
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function passed.
Definition: Helpers.inl:77
std::vector< NodeID > bfs(Graph &g)
Breadth first search traversal.

◆ add_qasymm8_signed_neon()

void add_qasymm8_signed_neon ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 35 of file qasymm8_signed.cpp.

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::quantize_qasymm8_signed(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

36 {
37  ARM_COMPUTE_UNUSED(policy);
38 
39  // Create input windows
40  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
42 
43  // Clear X Dimension on execution window as we handle manually
44  Window win = window;
45  win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  const int window_step_x = 16;
48  const auto window_start_x = static_cast<int>(window.x().start());
49  const auto window_end_x = static_cast<int>(window.x().end());
50  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
51 
52  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
53  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
54  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
55 
56  const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
57  const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
58 
59  if(is_broadcast_across_x)
60  {
61  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
62  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
63  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
64  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
65  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
66  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
67  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
68 
69  const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
70  const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
71  const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
72  const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
73 
74  // Clear X Dimension on execution window as we handle manually
75  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
76 
77  Iterator broadcast_input(broadcast_tensor, broadcast_win);
78  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
79  Iterator output(dst, win);
80 
81  execute_window_loop(win, [&](const Coordinates &)
82  {
83  const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
84  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
85 
86  const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
87  const int8x16_t broadcast_value_vec = vdupq_n_s8(broadcast_value);
88 
89  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2);
90  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(broadcast_value_vec)))), voffset2)), vscale2);
91  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2);
92  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(broadcast_value_vec)))), voffset2)), vscale2);
93  const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
94 
95  // Compute S elements per iteration
96  int x = window_start_x;
97  for(; x <= (window_end_x - window_step_x); x += window_step_x)
98  {
99  const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
100 
101  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
102  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
103  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
104  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
105 
106  int32x4_t rf_0{};
107  int32x4_t rf_1{};
108  int32x4_t rf_2{};
109  int32x4_t rf_3{};
110 
111 #ifdef __aarch64__
112  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
113  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
114  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
115  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
116 #else //__aarch64__
117  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
118  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
119  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
120  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
121 #endif //__aarch64__
122 
123  const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
124  const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
125  vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
126  }
127 
128  // Compute left-over elements
129  for(; x < window_end_x; ++x)
130  {
131  const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
132  *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), oq_info);
133  }
134  },
135  broadcast_input, non_broadcast_input, output);
136  }
137  else
138  {
139  // Clear X Dimension on execution window as we handle manually
140  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
141  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
142 
143  Iterator input1(src0, input1_win);
144  Iterator input2(src1, input2_win);
145  Iterator output(dst, win);
146 
147  const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
148  const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
149  const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
150  const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
151  execute_window_loop(win, [&](const Coordinates &)
152  {
153  const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
154  const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
155  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
156 
157  // Compute S elements per iteration
158  int x = window_start_x;
159  for(; x <= (window_end_x - window_step_x); x += window_step_x)
160  {
161  const int8x16_t a = vld1q_s8(input1_ptr + x);
162  const int8x16_t b = vld1q_s8(input2_ptr + x);
163 
164  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
165  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(a)))), voffset1)), vscale1);
166  const auto af_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
167  const auto af_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(a)))), voffset1)), vscale1);
168 
169  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2);
170  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(b)))), voffset2)), vscale2);
171  const auto bf_2 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2);
172  const auto bf_3 = vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(b)))), voffset2)), vscale2);
173 
174  int32x4_t rf_0{};
175  int32x4_t rf_1{};
176  int32x4_t rf_2{};
177  int32x4_t rf_3{};
178 
179 #ifdef __aarch64__
180  rf_0 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
181  rf_1 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
182  rf_2 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
183  rf_3 = vcvtnq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
184 #else //__aarch64__
185  rf_0 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_0, bf_0), invvscaleo));
186  rf_1 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_1, bf_1), invvscaleo));
187  rf_2 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_2, bf_2), invvscaleo));
188  rf_3 = vcvtq_s32_f32(vmlaq_f32(voffseto, vaddq_f32(af_3, bf_3), invvscaleo));
189 #endif //__aarch64__
190 
191  const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
192  const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
193  vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
194  }
195 
196  // Compute left-over elements
197  for(; x < window_end_x; ++x)
198  {
199  const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
200  const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;
201  *(output_ptr + x) = quantize_qasymm8_signed((afs + bfs), dst->info()->quantization_info());
202  }
203  },
204  input1, input2, output);
205  }
206 }
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:1084
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
int8_t quantize_qasymm8_signed(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given a signed 8-bit asymmetric quantization scheme.
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function passed.
Definition: Helpers.inl:77
std::vector< NodeID > bfs(Graph &g)
Breadth first search traversal.

◆ add_qasymm8_signed_sve2()

void add_qasymm8_signed_sve2 ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 37 of file qasymm8_signed.cpp.

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

38 {
39  ARM_COMPUTE_UNUSED(policy);
40 
41  // Create input windows
42  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
43  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
44 
45  // Clear X Dimension on execution window as we handle manually
46  Window win = window;
47  win.set(Window::DimX, Window::Dimension(0, 1, 1));
48 
49  const auto window_start_x = static_cast<int>(window.x().start());
50  const auto window_end_x = static_cast<int>(window.x().end());
51  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
52 
53  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
54  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
55  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
56 
57  const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
58  const auto voffseto = svdup_n_f32(oq_info.offset);
59 
60  if(is_broadcast_across_x)
61  {
62  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
63  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
64  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
65  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
66  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
67  const auto all_true_pg = svptrue_b8();
68 
69  const auto vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale);
70  const auto vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale);
71  const auto voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset);
72  const auto voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset);
73 
74  // Clear X Dimension on execution window as we handle manually
75  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
76 
77  Iterator broadcast_input(broadcast_tensor, broadcast_win);
78  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
79  Iterator output(dst, win);
80 
81  execute_window_loop(win, [&](const Coordinates &)
82  {
83  const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
84  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
85 
86  const int8_t broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
87  const auto broadcast_value_vec = svdup_n_s8(broadcast_value);
88 
89  int x = window_start_x;
90  svbool_t pg = svwhilelt_b8(x, window_end_x);
91  const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
92  const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
93  const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
94  const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
95 
96  do
97  {
98  const auto a = svld1_s8(pg, non_broadcast_input_ptr + x);
99  const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
100  const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
101  const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
102  const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
103 
104  const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
105  const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
106  const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
107  const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
108 
109  const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
110  const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
111  const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
112 
113  svst1_s8(pg, output_ptr + x, res);
114 
115  x += svcntb();
116  pg = svwhilelt_b8(x, window_end_x);
117  }
118  while(svptest_any(all_true_pg, pg));
119  },
120  broadcast_input, non_broadcast_input, output);
121  }
122  else
123  {
124  // Clear X Dimension on execution window as we handle manually
125  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
126  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
127 
128  Iterator input1(src0, input1_win);
129  Iterator input2(src1, input2_win);
130  Iterator output(dst, win);
131 
132  const auto vscale1 = svdup_n_f32(iq1_info.scale);
133  const auto vscale2 = svdup_n_f32(iq2_info.scale);
134  const auto voffset1 = svdup_n_s32(iq1_info.offset);
135  const auto voffset2 = svdup_n_s32(iq2_info.offset);
136 
137  execute_window_loop(win, [&](const Coordinates &)
138  {
139  const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
140  const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
141  const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
142 
143  int x = window_start_x;
144  svbool_t pg = svwhilelt_b8(x, window_end_x);
145  do
146  {
147  const auto a = svld1_s8(pg, input1_ptr + x);
148  const auto b = svld1_s8(pg, input2_ptr + x);
149 
150  const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
151  const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
152  const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
153  const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
154 
155  const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2);
156  const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2);
157  const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2);
158  const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2);
159 
160  const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
161  const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
162  const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
163  const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
164 
165  const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
166  const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
167  const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
168 
169  svst1_s8(pg, output_ptr + x, res);
170 
171  x += svcntb();
172  pg = svwhilelt_b8(x, window_end_x);
173  }
174  while(svptest_any(svptrue_b8(), pg));
175  },
176  input1, input2, output);
177  }
178 }
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:1084
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77

◆ add_qasymm8_sve2()

void add_qasymm8_sve2 ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 37 of file qasymm8.cpp.

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), UniformQuantizationInfo::offset, Iterator::ptr(), ITensorInfo::quantization_info(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

38 {
39  ARM_COMPUTE_UNUSED(policy);
40 
41  // Create input windows
42  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
43  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
44 
45  // Clear X Dimension on execution window as we handle manually
46  Window win = window;
47  win.set(Window::DimX, Window::Dimension(0, 1, 1));
48 
49  const auto window_start_x = static_cast<int>(window.x().start());
50  const auto window_end_x = static_cast<int>(window.x().end());
51  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
52  const auto all_true_pg = svptrue_b8();
53 
54  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
55  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
56  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
57 
58  const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
59  const auto voffseto = svdup_n_f32(oq_info.offset);
60 
61  if(is_broadcast_across_x)
62  {
63  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
64  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
65  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
66  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
67  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
68 
69  const svfloat32_t vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale);
70  const svfloat32_t vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale);
71  const svint32_t voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset);
72  const svint32_t voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset);
73 
74  // Clear X Dimension on execution window as we handle manually
75  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
76 
77  Iterator broadcast_input(broadcast_tensor, broadcast_win);
78  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
79  Iterator output(dst, win);
80 
81  execute_window_loop(win, [&](const Coordinates &)
82  {
83  const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
84  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
85 
86  const uint8_t broadcast_value = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
87  const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value);
88 
89  int x = window_start_x;
90  svbool_t pg = svwhilelt_b8(x, window_end_x);
91 
92  const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
93  const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
94  const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
95  const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
96 
97  do
98  {
99  const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x);
100 
101  const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
102  const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
103  const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
104  const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
105 
106  const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
107  const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
108  const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
109  const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
110 
111  const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
112  const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
113 
114  const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
115  svst1_u8(pg, output_ptr + x, res);
116 
117  x += svcntb();
118  pg = svwhilelt_b8(x, window_end_x);
119  }
120  while(svptest_any(all_true_pg, pg));
121  },
122  broadcast_input, non_broadcast_input, output);
123  }
124  else
125  {
126  // Clear X Dimension on execution window as we handle manually
127  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
128  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
129 
130  Iterator input1(src0, input1_win);
131  Iterator input2(src1, input2_win);
132  Iterator output(dst, win);
133 
134  const auto vscale1 = svdup_n_f32(iq1_info.scale);
135  const auto vscale2 = svdup_n_f32(iq2_info.scale);
136  const auto voffset1 = svdup_n_s32(iq1_info.offset);
137  const auto voffset2 = svdup_n_s32(iq2_info.offset);
138 
139  execute_window_loop(win, [&](const Coordinates &)
140  {
141  const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
142  const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
143  const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
144 
145  int x = window_start_x;
146  svbool_t pg = svwhilelt_b8(x, window_end_x);
147  do
148  {
149  const auto a = svld1_u8(pg, input1_ptr + x);
150  const auto b = svld1_u8(pg, input2_ptr + x);
151  const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
152  const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
153  const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
154  const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
155 
156  const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2);
157  const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2);
158  const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2);
159  const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2);
160 
161  const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
162  const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
163  const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
164  const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
165 
166  const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
167  const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
168  const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
169 
170  svst1_u8(pg, output_ptr + x, res);
171 
172  x += svcntb();
173  pg = svwhilelt_b8(x, window_end_x);
174  }
175  while(svptest_any(all_true_pg, pg));
176  },
177  input1, input2, output);
178  }
179 }
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:1084
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77

◆ add_qsymm16_neon()

void add_qsymm16_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 35 of file qsymm16.cpp.

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, arm_compute::graph::bfs(), Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::quantize_qsymm16(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

36 {
37  ARM_COMPUTE_UNUSED(policy);
38 
39  // Create input windows
40  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
42 
43  // Clear X Dimension on execution window as we handle manually
44  Window win = window;
45  win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  const int window_step_x = 8;
48  const auto window_start_x = static_cast<int>(window.x().start());
49  const auto window_end_x = static_cast<int>(window.x().end());
50  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
51 
52  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
53  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
54  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
55 
56  const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
57  const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
58  const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
59 
60  if(is_broadcast_across_x)
61  {
62  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
63  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
64  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
65  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
66  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
67  const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
68  const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
69 
70  // Clear X Dimension on execution window as we handle manually
71  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
72 
73  Iterator broadcast_input(broadcast_tensor, broadcast_win);
74  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
75  Iterator output(dst, win);
76 
77  execute_window_loop(win, [&](const Coordinates &)
78  {
79  const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
80  const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
81 
82  const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
83  const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
84 
85  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
86  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
87  const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
88 
89  // Compute S elements per iteration
90  int x = window_start_x;
91  for(; x <= (window_end_x - window_step_x); x += window_step_x)
92  {
93  const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
94  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
95  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
96 
97  int32x4_t rf_0{};
98  int32x4_t rf_1{};
99 #ifdef __aarch64__
100  rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
101  rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
102 #else //__aarch64__
103  rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
104  rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
105 #endif //__aarch64__
106 
107  const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
108  vst1q_s16(output_ptr + x, pa);
109  }
110 
111  // Compute left-over elements
112  for(; x < window_end_x; ++x)
113  {
114  const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
115  *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
116  }
117  },
118  broadcast_input, non_broadcast_input, output);
119  }
120  else
121  {
122  // Clear X Dimension on execution window as we handle manually
123  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
124  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
125 
126  Iterator input1(src0, input1_win);
127  Iterator input2(src1, input2_win);
128  Iterator output(dst, win);
129 
130  execute_window_loop(win, [&](const Coordinates &)
131  {
132  const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
133  const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
134  const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
135 
136  // Compute S elements per iteration
137  int x = window_start_x;
138  for(; x <= (window_end_x - window_step_x); x += window_step_x)
139  {
140  const int16x8_t a = vld1q_s16(input1_ptr + x);
141  const int16x8_t b = vld1q_s16(input2_ptr + x);
142 
143  const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
144  const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
145  const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
146  const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
147 
148  int32x4_t rf_0{};
149  int32x4_t rf_1{};
150 #ifdef __aarch64__
151  rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
152  rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
153 #else //__aarch64__
154  rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
155  rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
156 #endif //__aarch64__
157 
158  const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
159  vst1q_s16(output_ptr + x, pa);
160  }
161 
162  // Compute left-over elements
163  for(; x < window_end_x; ++x)
164  {
165  const float afs = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
166  const float bfs = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
167  *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info());
168  }
169  },
170  input1, input2, output);
171  }
172 }
int16_t quantize_qsymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given a 16-bit symmetric quantization scheme.
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
std::vector< NodeID > bfs(Graph &g)
Breadth first search traversal.

◆ add_qsymm16_sve2()

void add_qsymm16_sve2 ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 37 of file qsymm16.cpp.

References ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), ITensorInfo::quantization_info(), UniformQuantizationInfo::scale, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), Dimensions< T >::x(), and Window::x().

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

38 {
39  ARM_COMPUTE_UNUSED(policy);
40 
41  // Create input windows
42  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
43  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
44 
45  // Clear X Dimension on execution window as we handle manually
46  Window win = window;
47  win.set(Window::DimX, Window::Dimension(0, 1, 1));
48 
49  const auto window_start_x = static_cast<int>(window.x().start());
50  const auto window_end_x = static_cast<int>(window.x().end());
51  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
52 
53  const UniformQuantizationInfo iq1_info = src0->info()->quantization_info().uniform();
54  const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform();
55  const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
56 
57  const auto vscale1 = svdup_n_f32(iq1_info.scale);
58  const auto vscale2 = svdup_n_f32(iq2_info.scale);
59  const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
60  const auto all_true_pg = svptrue_b16();
61 
62  if(is_broadcast_across_x)
63  {
64  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
65  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
66  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
67  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
68  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
69 
70  // Clear X Dimension on execution window as we handle manually
71  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
72 
73  Iterator broadcast_input(broadcast_tensor, broadcast_win);
74  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
75  Iterator output(dst, win);
76 
77  execute_window_loop(win, [&](const Coordinates &)
78  {
79  const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
80  const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
81 
82  const int16_t broadcast_value = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
83  const auto broadcast_value_vec = svdup_n_s16(broadcast_value);
84 
85  int x = window_start_x;
86  svbool_t pg = svwhilelt_b16(x, window_end_x);
87 
88  const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2);
89  const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2);
90 
91  do
92  {
93  const auto a = svld1_s16(pg, non_broadcast_input_ptr + x);
94  const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
95  const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
96 
97  const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
98  const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
99 
100  const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
101 
102  svst1_s16(pg, output_ptr + x, res);
103 
104  x += svcnth();
105  pg = svwhilelt_b16(x, window_end_x);
106  }
107  while(svptest_any(all_true_pg, pg));
108  },
109  broadcast_input, non_broadcast_input, output);
110  }
111  else
112  {
113  // Clear X Dimension on execution window as we handle manually
114  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
115  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
116 
117  Iterator input1(src0, input1_win);
118  Iterator input2(src1, input2_win);
119  Iterator output(dst, win);
120 
121  execute_window_loop(win, [&](const Coordinates &)
122  {
123  const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
124  const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
125  const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
126 
127  int x = window_start_x;
128  svbool_t pg = svwhilelt_b16(x, window_end_x);
129  do
130  {
131  auto a = svld1_s16(pg, input1_ptr + x);
132  auto b = svld1_s16(pg, input2_ptr + x);
133 
134  const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
135  const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
136 
137  const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2);
138  const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2);
139 
140  const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
141  const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
142 
143  const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
144  svst1_s16(pg, output_ptr + x, res);
145 
146  x += svcnth();
147  pg = svwhilelt_b16(x, window_end_x);
148  }
149  while(svptest_any(all_true_pg, pg));
150  },
151  input1, input2, output);
152  }
153 }
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77

◆ add_s16_neon()

void add_s16_neon ( const ITensor * src0,
const ITensor * src1,
ITensor * dst,
const ConvertPolicy & policy,
const Window & window 
)

Definition at line 36 of file integer.cpp.

References add_same_neon< int16_t >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

37 {
38  return add_same_neon<int16_t>(src0, src1, dst, policy, window);
39 }
template void add_same_neon< int16_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ add_s16_neon_as_1d_array()

void add_s16_neon_as_1d_array ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 51 of file integer.cpp.

References add_same_neon_as_1d_array< int16_t >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

52 {
53  return add_same_neon_as_1d_array<int16_t>(src0, src1, dst, policy, window);
54 }
template void add_same_neon_as_1d_array< int16_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ add_s16_sve()

void add_s16_sve ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 38 of file integer.cpp.

References add_same_sve< int16_t >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

39 {
40  return add_same_sve<int16_t>(src0, src1, dst, policy, window);
41 }
template void add_same_sve< int16_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ add_s32_neon()

void add_s32_neon ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 41 of file integer.cpp.

References add_same_neon< int32_t >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

42 {
43  return add_same_neon<int32_t>(src0, src1, dst, policy, window);
44 }
template void add_same_neon< int32_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ add_s32_neon_as_1d_array()

void add_s32_neon_as_1d_array ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 56 of file integer.cpp.

References add_same_neon_as_1d_array< int32_t >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

57 {
58  return add_same_neon_as_1d_array<int32_t>(src0, src1, dst, policy, window);
59 }
template void add_same_neon_as_1d_array< int32_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ add_s32_sve()

void add_s32_sve ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 43 of file integer.cpp.

References add_same_sve< int32_t >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

44 {
45  return add_same_sve<int32_t>(src0, src1, dst, policy, window);
46 }
template void add_same_sve< int32_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ add_same_neon()

void add_same_neon ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Element-wise addition of two tensors using NEON, supporting broadcast of a unit x-dimension; saturates the result when policy is ConvertPolicy::SATURATE, otherwise wraps.

Definition at line 34 of file impl.cpp.

References arm_compute::wrapper::add_sat(), Window::broadcast_if_dimension_le_one(), Window::DimX, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), arm_compute::SATURATE, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vqadd(), arm_compute::wrapper::vstore(), Dimensions< T >::x(), and Window::x().

35 {
36  /** SIMD vector tag type. */
37  using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
38 
39  // Create input windows
40  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
42 
43  // Clear X Dimension on execution window as we handle manually
44  Window win = window;
45  win.set(Window::DimX, Window::Dimension(0, 1, 1));
46 
47  constexpr int window_step_x = 16 / sizeof(ScalarType);
48  const auto window_start_x = static_cast<int>(window.x().start());
49  const auto window_end_x = static_cast<int>(window.x().end());
50  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
51 
52  if(is_broadcast_across_x)
53  {
54  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
55  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
56  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
57  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
58  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
59 
60  // Clear X Dimension on execution window as we handle manually
61  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
62 
63  Iterator broadcast_input(broadcast_tensor, broadcast_win);
64  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
65  Iterator output(dst, win);
66 
67  execute_window_loop(win, [&](const Coordinates &)
68  {
69  const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
70  const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
71 
72  const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
73  const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
74 
75  // Compute S elements per iteration
76  int x = window_start_x;
77  for(; x <= (window_end_x - window_step_x); x += window_step_x)
78  {
79  const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
80  const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
81  wrapper::vstore(output_ptr + x, res);
82  }
83 
84  // Compute left-over elements
85  for(; x < window_end_x; ++x)
86  {
87  const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
88  *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
89  }
90  },
91  broadcast_input, non_broadcast_input, output);
92  }
93  else
94  {
95  // Clear X Dimension on execution window as we handle manually
96  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
97  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
98 
99  Iterator input1(src0, input1_win);
100  Iterator input2(src1, input2_win);
101  Iterator output(dst, win);
102 
103  execute_window_loop(win, [&](const Coordinates &)
104  {
105  const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
106  const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
107  const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
108 
109  // Compute S elements per iteration
110  int x = window_start_x;
111  for(; x <= (window_end_x - window_step_x); x += window_step_x)
112  {
113  const auto val1 = wrapper::vloadq(input1_ptr + x);
114  const auto val2 = wrapper::vloadq(input2_ptr + x);
115  const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
116  wrapper::vstore(output_ptr + x, res);
117  }
118 
119  // Compute left-over elements
120  for(; x < window_end_x; ++x)
121  {
122  const auto val1 = *(input1_ptr + x);
123  const auto val2 = *(input2_ptr + x);
124  *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
125  }
126  },
127  input1, input2, output);
128  }
129 }
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
uint8_t add_sat(const uint8_t &a, const uint8_t &b)
Definition: add.h:33
uint8x8_t vqadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:73
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77

◆ add_same_neon< float >()

template void arm_compute::cpu::add_same_neon< float > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

◆ add_same_neon< int16_t >()

template void arm_compute::cpu::add_same_neon< int16_t > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

◆ add_same_neon< int32_t >()

template void arm_compute::cpu::add_same_neon< int32_t > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

◆ add_same_neon< uint8_t >()

template void arm_compute::cpu::add_same_neon< uint8_t > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

◆ add_same_neon_as_1d_array()

void add_same_neon_as_1d_array ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 132 of file impl.cpp.

References add_same_neon< float >(), add_same_neon< int16_t >(), add_same_neon< int32_t >(), add_same_neon< uint8_t >(), add_same_neon_as_1d_array< float >(), add_same_neon_as_1d_array< int16_t >(), add_same_neon_as_1d_array< int32_t >(), add_same_neon_as_1d_array< uint8_t >(), arm_compute::wrapper::add_sat(), ITensor::buffer(), arm_compute::test::validation::dst, Window::Dimension::end(), arm_compute::SATURATE, Window::Dimension::start(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vqadd(), arm_compute::wrapper::vstore(), and Window::x().

133 {
134  const ScalarType *src0_ptr = reinterpret_cast<const ScalarType *>(src0->buffer());
135  const ScalarType *src1_ptr = reinterpret_cast<const ScalarType *>(src1->buffer());
136  ScalarType *dst_ptr = reinterpret_cast<ScalarType *>(dst->buffer());
137 
138  constexpr int window_step_x = 16 / sizeof(ScalarType);
139  const auto window_start_x = static_cast<int>(window.x().start());
140  const auto window_end_x = static_cast<int>(window.x().end());
141 
142  int x = window_start_x;
143  for(; x <= (window_end_x - window_step_x); x += window_step_x)
144  {
145  const auto val1 = wrapper::vloadq(src0_ptr + x);
146  const auto val2 = wrapper::vloadq(src1_ptr + x);
147  const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
148  wrapper::vstore(dst_ptr + x, res);
149  }
150 
151  // Compute left-over elements
152  for(; x < window_end_x; ++x)
153  {
154  const auto val1 = *(src0_ptr + x);
155  const auto val2 = *(src1_ptr + x);
156  *(dst_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
157  }
158 }
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
uint8_t add_sat(const uint8_t &a, const uint8_t &b)
Definition: add.h:33
uint8x8_t vqadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:73
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39

◆ add_same_neon_as_1d_array< float >()

template void arm_compute::cpu::add_same_neon_as_1d_array< float > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

◆ add_same_neon_as_1d_array< int16_t >()

template void arm_compute::cpu::add_same_neon_as_1d_array< int16_t > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

◆ add_same_neon_as_1d_array< int32_t >()

template void arm_compute::cpu::add_same_neon_as_1d_array< int32_t > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

◆ add_same_neon_as_1d_array< uint8_t >()

template void arm_compute::cpu::add_same_neon_as_1d_array< uint8_t > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

◆ add_same_sve()

void add_same_sve ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 36 of file impl.cpp.

References add_same_sve< float >(), add_same_sve< int16_t >(), add_same_sve< int32_t >(), add_same_sve< uint8_t >(), Window::broadcast_if_dimension_le_one(), Window::DimX, arm_compute::test::validation::dst, Window::Dimension::end(), arm_compute::execute_window_loop(), ITensor::info(), Iterator::ptr(), arm_compute::SATURATE, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::tensor_shape(), Dimensions< T >::x(), and Window::x().

37 {
38  const auto all_true_pg = wrapper::svptrue<ScalarType>();
39  const auto window_start_x = static_cast<int>(window.x().start());
40  const auto window_end_x = static_cast<int>(window.x().end());
41  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
42  const bool is_sat = (policy == ConvertPolicy::SATURATE);
43 
44  // Clear X Dimension on execution window as we handle manually
45  Window win = window;
46  win.set(Window::DimX, Window::Dimension(0, 1, 1));
47 
48  // Create input windows
49  Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
50  Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
51 
52  Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()));
53  Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
54  Iterator output(dst, window);
55 
56  if(is_broadcast_across_x)
57  {
58  const bool is_broadcast_input_2 = input2_win.x().step() == 0;
59  Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
60  Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
61  const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
62  const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
63 
64  // Clear X Dimension on execution window as we handle manually
65  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
66 
67  Iterator broadcast_input(broadcast_tensor, broadcast_win);
68  Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
69  Iterator output(dst, win);
70 
71  execute_window_loop(win, [&](const Coordinates &)
72  {
73  const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
74  const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
75 
76  const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
77  const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value);
78 
79  int x = window_start_x;
80  svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
81  do
82  {
83  const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
84  auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
85  svst1(pg, output_ptr + x, res);
86 
87  x += wrapper::svcnt<ScalarType>();
88  pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
89  }
90  while(svptest_any(all_true_pg, pg));
91  },
92  broadcast_input, non_broadcast_input, output);
93  }
94  else
95  {
96  // Clear X Dimension on execution window as we handle manually
97  input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
98  input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
99 
100  Iterator input1(src0, input1_win);
101  Iterator input2(src1, input2_win);
102  Iterator output(dst, win);
103 
104  execute_window_loop(win, [&](const Coordinates &)
105  {
106  const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
107  const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
108  const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
109 
110  int x = window_start_x;
111  svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
112  do
113  {
114  const auto val1 = svld1(pg, input1_ptr + x);
115  const auto val2 = svld1(pg, input2_ptr + x);
116  const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
117  svst1(pg, output_ptr + x, res);
118 
119  x += wrapper::svcnt<ScalarType>();
120  pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
121  }
122  while(svptest_any(all_true_pg, pg));
123  },
124  input1, input2, output);
125  }
126 }
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77

◆ add_same_sve< float >()

template void arm_compute::cpu::add_same_sve< float > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Referenced by add_fp32_sve(), and add_same_sve().

◆ add_same_sve< int16_t >()

template void arm_compute::cpu::add_same_sve< int16_t > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Referenced by add_s16_sve(), and add_same_sve().

◆ add_same_sve< int32_t >()

template void arm_compute::cpu::add_same_sve< int32_t > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Referenced by add_s32_sve(), and add_same_sve().

◆ add_same_sve< uint8_t >()

template void arm_compute::cpu::add_same_sve< uint8_t > ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Referenced by add_same_sve(), and add_u8_sve().

◆ add_u8_neon()

void add_u8_neon ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 31 of file integer.cpp.

References add_same_neon< uint8_t >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

32 {
33  return add_same_neon<uint8_t>(src0, src1, dst, policy, window);
34 }
template void add_same_neon< uint8_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ add_u8_neon_as_1d_array()

void add_u8_neon_as_1d_array ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 46 of file integer.cpp.

References add_same_neon_as_1d_array< uint8_t >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

47 {
48  return add_same_neon_as_1d_array<uint8_t>(src0, src1, dst, policy, window);
49 }
template void add_same_neon_as_1d_array< uint8_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ add_u8_sve()

void add_u8_sve ( const ITensor src0,
const ITensor src1,
ITensor dst,
const ConvertPolicy policy,
const Window window 
)

Definition at line 33 of file integer.cpp.

References add_same_sve< uint8_t >(), and arm_compute::test::validation::dst.

Referenced by arm_compute::cpu::kernels::can_interpret_inputs_as_1d_array().

34 {
35  return add_same_sve<uint8_t>(src0, src1, dst, policy, window);
36 }
template void add_same_sve< uint8_t >(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)

◆ avg_poolingMxNxD_q8_neon_ndhwc()

void arm_compute::cpu::avg_poolingMxNxD_q8_neon_ndhwc ( const ITensor src,
ITensor dst0,
Pooling3dLayerInfo pool_info,
const Window window_out,
const int  window_step_x 
)

Definition at line 38 of file quantized.h.

References Padding3D::back, Padding3D::bottom, ITensor::buffer(), Size3D::depth, ITensorInfo::dimension(), Pooling3dLayerInfo::exclude_padding, arm_compute::execute_window_loop(), Padding3D::front, Size3D::height, ITensor::info(), Pooling3dLayerInfo::is_global_pooling, Padding3D::left, UniformQuantizationInfo::offset, ITensorInfo::offset_first_element_in_bytes(), Pooling3dLayerInfo::padding, Pooling3dLayerInfo::pool_size, Iterator::ptr(), ITensorInfo::quantization_info(), Padding3D::right, UniformQuantizationInfo::scale, arm_compute::test::validation::scale, Pooling3dLayerInfo::stride, ITensorInfo::strides_in_bytes(), ITensorInfo::tensor_shape(), Padding3D::top, QuantizationInfo::uniform(), arm_compute::wrapper::vadd(), arm_compute::wrapper::vcombine(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vgethigh(), arm_compute::wrapper::vgetlow(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmla(), arm_compute::wrapper::vmovl(), arm_compute::wrapper::vmovn(), arm_compute::wrapper::vstore(), Size3D::width, Dimensions< T >::y(), and Dimensions< T >::z().

41 {
42  using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type;
43  using q8x16_t = typename wrapper::traits::neon_vector<T, 16>::type;
44  using q16_t = typename wrapper::traits::promote_t<T>;
45  using q16x8_t = typename wrapper::traits::neon_vector<q16_t, 8>::type;
46  using q32_t = typename wrapper::traits::promote_t<q16_t>;
47  using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
48 
49  int pool_stride_x = static_cast<int>(pool_info.stride.width);
50  int pool_stride_y = static_cast<int>(pool_info.stride.height);
51  int pool_stride_z = static_cast<int>(pool_info.stride.depth);
52 
53  const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
54  const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
55  const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
56 
57  const int pool_pad_top = static_cast<int>(pool_info.padding.top);
58  const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
59  const int pool_pad_left = static_cast<int>(pool_info.padding.left);
60  const int pool_pad_right = static_cast<int>(pool_info.padding.right);
61  const int pool_pad_front = static_cast<int>(pool_info.padding.front);
62  const int pool_pad_back = static_cast<int>(pool_info.padding.back);
63 
64  const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
65  const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
66  const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
67 
68  const int input_dim_c = src->info()->dimension(0);
69  const int input_dim_w = src->info()->dimension(1);
70  const int input_dim_h = src->info()->dimension(2);
71  const int input_dim_d = src->info()->dimension(3);
72 
73  const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
74  const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
75  const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
76  const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
77 
78  const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
79 
80  const int window_end_x = input_dim_c;
81  const int window_start_x = 0;
82 
83  Iterator out(dst0, window_out);
84 
85  const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
86  const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
87  const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
88 
89  const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
90  // "new_offset" doesn't have to consider the "half_scale_v" in its computation
91  // With a requantization performed in a single step there won't be uncertainties introduced
92  const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
93 
94  execute_window_loop(window_out, [&](const Coordinates & id)
95  {
96  // Computing the theoretical input starting/ending points
97  const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
98  const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
99  const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
100 
101  const int pool_start_x = std::max(0, -in_idx_width);
102  const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
103  const int pool_start_y = std::max(0, -in_idx_height);
104  const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
105 
106  const int pool_start_z = std::max(0, -in_idx_depth);
107  const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
108 
109  // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
110  const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
111  const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
112  const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
113 
114  // Calculate scale
115  const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
116  pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
117 
118  const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
119 
120  int x_off = window_start_x;
121 
122  for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
123  {
124  q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
125  q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
126  q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
127  q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
128 
129  // Perform pooling
130  for(int z = pool_start_z; z < pool_end_z; ++z)
131  {
132  const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
133  for(int y = pool_start_y; y < pool_end_y; ++y)
134  {
135  const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
136  for(int x = pool_start_x; x < pool_end_x; ++x)
137  {
138  const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
139  const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
140 
141  const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data));
142  const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
143  vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
144  vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
145  vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
146  vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
147  }
148  }
149  }
150 
151  if(src_qinfo != dst_qinfo)
152  {
153  const float32x4x4_t vres =
154  {
155  {
156  vcvtq_f32_q32(vres1),
157  vcvtq_f32_q32(vres2),
158  vcvtq_f32_q32(vres3),
159  vcvtq_f32_q32(vres4),
160  }
161  };
162  const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
163  // Store result
164  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
165  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
166  }
167  else
168  {
169  const float32x4_t scale_v = vdupq_n_f32(scale);
170  // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
171  vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
172  vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
173  vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
174  vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
175 
176  const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
177  const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
178  // Store result
179  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
180  wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
181  }
182  }
183 
184  // Left-overs loop
185  for(; x_off < window_end_x; ++x_off)
186  {
187  q32_t res = static_cast<q32_t>(0.f);
188 
189  // Perform pooling
190  for(int z = pool_start_z; z < pool_end_z; ++z)
191  {
192  const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
193  for(int y = pool_start_y; y < pool_end_y; ++y)
194  {
195  const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
196  for(int x = pool_start_x; x < pool_end_x; ++x)
197  {
198  const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
199  const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
200  res += data;
201  }
202  }
203  }
204 
205  if(src_qinfo != dst_qinfo)
206  {
207  const float res_f = static_cast<float>(res);
208  const float new_scale = quant_rescale / scale;
209  const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
210 
211  // Store result
212  *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
213  }
214  else
215  {
216  // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
217  res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
218 
219  // Store result
220  *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
221  }
222  }
223  },
224  out);
225 }
uint32x2_t vmovn(const uint64x2_t &a)
Definition: movn.h:39
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39
decltype(strategy::transforms) typedef type
SimpleTensor< float > src
Definition: DFT.cpp:155
uint8x8_t vgetlow(const uint8x16_t val)
Definition: getlow.h:39
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
Definition: combine.h:39
uint8x8_t vgethigh(const uint8x16_t val)
Definition: gethigh.h:39
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Definition: mla.h:46
uint16x8_t vmovl(const uint8x8_t &a)
Definition: movl.h:39

◆ bilinear_neon_scale()

void arm_compute::cpu::bilinear_neon_scale ( const ITensor *src,
ITensor *dst,
const ITensor *offsets,
const ITensor *dx,
const ITensor *dy,
BorderMode  border_mode,
PixelValue  constant_border_value,
float  sampling_offset,
bool  align_corners,
const Window &window 
)

Definition at line 143 of file list.h.

References ARM_COMPUTE_ERROR, ARM_COMPUTE_UNUSED, arm_compute::test::validation::b, arm_compute::scale_utils::calculate_resize_ratio(), arm_compute::CONSTANT, ITensorInfo::dimension(), Window::DimX, Window::DimY, Window::DimZ, Window::Dimension::end(), PixelValue::get(), ITensor::info(), Iterator::ptr(), arm_compute::REPLICATE, arm_compute::test::validation::scale_x, arm_compute::test::validation::scale_y, Window::set(), Window::Dimension::start(), Window::Dimension::step(), ITensorInfo::strides_in_bytes(), type, arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmla(), arm_compute::wrapper::vstore(), Window::y(), and Window::z().

146 {
147  ARM_COMPUTE_UNUSED(offsets);
148  ARM_COMPUTE_UNUSED(dx);
149  ARM_COMPUTE_UNUSED(dy);
150  using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
151 
152  // Compute the ratio between source and destination dimensions
153  const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
154  const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
155 
156  const int in_stride_y = src->info()->strides_in_bytes()[1];
157  const int in_stride_z = src->info()->strides_in_bytes()[2];
158  const int in_stride_w = src->info()->strides_in_bytes()[3];
159  const int out_stride_y = dst->info()->strides_in_bytes()[1];
160  const int out_stride_z = dst->info()->strides_in_bytes()[2];
161  const int out_stride_w = dst->info()->strides_in_bytes()[3];
162  const int in_dim_w = src->info()->dimension(1);
163  const int in_dim_h = src->info()->dimension(2);
164  const int out_dim_ch = dst->info()->dimension(0);
165  const int step_cout = 16 / sizeof(T);
166 
167  Window window_execution = window;
168  window_execution.set(Window::DimX, Window::Dimension(0, 1, 1));
169  Window win_in_out(window);
170  win_in_out.set(Window::DimY, Window::Dimension(0, 0, 0));
171  win_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
172  Iterator in(src, win_in_out);
173  Iterator out(dst, win_in_out);
174 
175  const int xo_start = window_execution.y().start();
176  const int xo_end = window_execution.y().end();
177  const int xo_step = window_execution.y().step();
178  const int yo_start = window_execution.z().start();
179  const int yo_end = window_execution.z().end();
180  const int yo_step = window_execution.z().step();
181  const int bo_start = window_execution[3].start();
182  const int bo_end = window_execution[3].end();
183  const int bo_step = window_execution[3].step();
184 
185  if(border_mode == BorderMode::CONSTANT)
186  {
187 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
188  using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
189 #else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
190  using ConstType = T;
191 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
192  const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
193 
194  for(int bo = bo_start; bo < bo_end; bo += bo_step)
195  {
196  const uint8_t *in_ptr_base = in.ptr() + bo * in_stride_w;
197  uint8_t *out_ptr_base = out.ptr() + bo * out_stride_w;
198 
199  for(int yo = yo_start; yo < yo_end; yo += yo_step)
200  {
201  // Floating-point coordinate
202  const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
203  // Integer coordinate
204  const auto yi = static_cast<int>(std::floor(yi_f));
205  // Weight for the y coordinate
206  const auto a1 = (yi_f - static_cast<float>(yi));
207  const auto b1 = (1.f - a1);
208 
209  for(int xo = xo_start; xo < xo_end; xo += xo_step)
210  {
211  // Floating-point coordinate
212  const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
213  // Integer coordinate
214  const auto xi = static_cast<int>(std::floor(xi_f));
215  // Weight for the x coordinate
216  const auto a = (xi_f - static_cast<float>(xi));
217  const auto b = (1.f - a);
218 
219  const auto s00_s = static_cast<T>(b * b1);
220  const auto s01_s = static_cast<T>(a * b1);
221  const auto s10_s = static_cast<T>(b * a1);
222  const auto s11_s = static_cast<T>(a * a1);
223 
224  const uint8_t *in_ptr = in_ptr_base + xi * in_stride_y + yi * in_stride_z;
225  uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
226 
227  int cout = 0;
228  for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
229  {
230  auto in00 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
231  auto in01 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
232  auto in10 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
233  auto in11 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
234  if((yi >= 0) && (yi < in_dim_h))
235  {
236  if((xi >= 0) && (xi < in_dim_w))
237  {
238  in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
239  }
240  if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
241  {
242  in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
243  }
244  }
245  if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
246  {
247  if((xi >= 0) && (xi < in_dim_w))
248  {
249  in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
250  }
251  if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
252  {
253  in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
254  }
255  }
256 
257  const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{});
258  const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{});
259  const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{});
260  const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{});
261  auto out0 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
262  out0 = wrapper::vmla(out0, in00, s00);
263  out0 = wrapper::vmla(out0, in01, s01);
264  out0 = wrapper::vmla(out0, in10, s10);
265  out0 = wrapper::vmla(out0, in11, s11);
266  wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
267  }
268 
269  for(; cout < out_dim_ch; ++cout)
270  {
271  auto in00 = static_cast<T>(const_border_value);
272  auto in01 = static_cast<T>(const_border_value);
273  auto in10 = static_cast<T>(const_border_value);
274  auto in11 = static_cast<T>(const_border_value);
275  if((yi >= 0) && (yi < in_dim_h))
276  {
277  if((xi >= 0) && (xi < in_dim_w))
278  {
279  in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
280  }
281  if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
282  {
283  in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
284  }
285  }
286  if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
287  {
288  if((xi >= 0) && (xi < in_dim_w))
289  {
290  in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
291  }
292  if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
293  {
294  in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
295  }
296  }
297  auto out0 = static_cast<T>(0);
298  out0 += in00 * s00_s;
299  out0 += in01 * s01_s;
300  out0 += in10 * s10_s;
301  out0 += in11 * s11_s;
302  *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0;
303  }
304  }
305  }
306  }
307  }
308  else if(border_mode == BorderMode::REPLICATE)
309  {
310  for(int bo = bo_start; bo < bo_end; bo += bo_step)
311  {
312  const uint8_t *in_ptr = in.ptr() + bo * in_stride_w;
313  uint8_t *out_ptr = out.ptr() + bo * out_stride_w;
314 
315  for(int yo = yo_start; yo < yo_end; yo += yo_step)
316  {
317  // Floating-point coordinate
318  const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
319  // Integer coordinate
320  const auto yi = static_cast<int>(std::floor(yi_f));
321  // Weight for the y coordinate
322  const auto a1 = (yi_f - static_cast<float>(yi));
323  const auto b1 = (1.f - a1);
324 
325  const auto yi0 = utility::clamp<int>(yi, 0, in_dim_h - 1);
326  const auto yi1 = utility::clamp<int>(yi + 1, 0, in_dim_h - 1);
327 
328  for(int xo = xo_start; xo < xo_end; xo += xo_step)
329  {
330  // Floating-point coordinate
331  const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
332  // Integer coordinate
333  const auto xi = static_cast<int>(std::floor(xi_f));
334  // Weight for the x coordinate
335  const auto a = (xi_f - static_cast<float>(xi));
336  const auto b = (1.f - a);
337 
338  const auto s00_s = static_cast<T>(b * b1);
339  const auto s01_s = static_cast<T>(a * b1);
340  const auto s10_s = static_cast<T>(b * a1);
341  const auto s11_s = static_cast<T>(a * a1);
342 
343  const auto xi0 = utility::clamp<int>(xi, 0, in_dim_w - 1);
344  const auto xi1 = utility::clamp<int>(xi + 1, 0, in_dim_w - 1);
345 
346  int cout = 0;
347  for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
348  {
349  auto in00 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
350  auto in01 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
351  auto in10 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
352  auto in11 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
353  in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi0) * in_stride_y + (yi0) * in_stride_z));
354  in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi1) * in_stride_y + (yi0) * in_stride_z));
355  in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi0) * in_stride_y + (yi1) * in_stride_z));
356  in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi1) * in_stride_y + (yi1) * in_stride_z));
357 
358  const auto s00 = wrapper::vdup_n(s00_s, ExactTagType{});
359  const auto s01 = wrapper::vdup_n(s01_s, ExactTagType{});
360  const auto s10 = wrapper::vdup_n(s10_s, ExactTagType{});
361  const auto s11 = wrapper::vdup_n(s11_s, ExactTagType{});
362  auto out0 = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
363  out0 = wrapper::vmla(out0, in00, s00);
364  out0 = wrapper::vmla(out0, in01, s01);
365  out0 = wrapper::vmla(out0, in10, s10);
366  out0 = wrapper::vmla(out0, in11, s11);
367  wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T) + xo * out_stride_y + yo * out_stride_z), out0);
368  }
369 
370  for(; cout < out_dim_ch; ++cout)
371  {
372  auto in00 = static_cast<T>(0);
373  auto in01 = static_cast<T>(0);
374  auto in10 = static_cast<T>(0);
375  auto in11 = static_cast<T>(0);
376  in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi0) * in_stride_y + (yi0) * in_stride_z));
377  in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi1) * in_stride_y + (yi0) * in_stride_z));
378  in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi0) * in_stride_y + (yi1) * in_stride_z));
379  in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + (xi1) * in_stride_y + (yi1) * in_stride_z));
380  auto out0 = static_cast<T>(0);
381  out0 += in00 * s00_s;
382  out0 += in01 * s01_s;
383  out0 += in10 * s10_s;
384  out0 += in11 * s11_s;
385  *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T) + xo * out_stride_y + yo * out_stride_z)) = out0;
386  }
387  }
388  }
389  }
390  }
391  else
392  {
393  ARM_COMPUTE_ERROR("Not implemented");
394  }
395 }
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58
half_float::half half
16-bit floating point type
Definition: Types.h:48
decltype(strategy::transforms) typedef type
SimpleTensor< float > src
Definition: DFT.cpp:155
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41
float calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners=false)
Returns resize ratio between input and output with consideration of aligned corners.
Definition: ScaleUtils.cpp:27
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Definition: mla.h:46

◆ bounding_box_transform()

void bounding_box_transform ( const ITensor *boxes,
ITensor *pred_boxes,
const ITensor *deltas,
BoundingBoxTransformInfo  bbinfo,
const Window &window 
)

Definition at line 87 of file impl.cpp.

References BoundingBoxTransformInfo::apply_scale(), ARM_COMPUTE_ERROR_ON, BoundingBoxTransformInfo::bbox_xform_clip(), bounding_box_transform< float >(), ITensor::buffer(), BoundingBoxTransformInfo::correct_transform_coords(), arm_compute::execute_window_loop(), BoundingBoxTransformInfo::img_height(), BoundingBoxTransformInfo::img_width(), ITensor::info(), offset(), ITensorInfo::offset_first_element_in_bytes(), Iterator::ptr(), BoundingBoxTransformInfo::scale(), ITensorInfo::tensor_shape(), and BoundingBoxTransformInfo::weights().

88 {
89  const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2;
90  const size_t deltas_width = deltas->info()->tensor_shape()[0];
91  const int img_h = std::floor(bbinfo.img_height() / bbinfo.scale() + 0.5f);
92  const int img_w = std::floor(bbinfo.img_width() / bbinfo.scale() + 0.5f);
93 
94  const auto scale_after = (bbinfo.apply_scale() ? T(bbinfo.scale()) : T(1));
95  const auto scale_before = T(bbinfo.scale());
96  ARM_COMPUTE_ERROR_ON(scale_before <= 0);
97  const auto offset = (bbinfo.correct_transform_coords() ? T(1.f) : T(0.f));
98 
99  auto pred_ptr = reinterpret_cast<T *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
100  auto delta_ptr = reinterpret_cast<T *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
101 
102  Iterator box_it(boxes, window);
103  execute_window_loop(window, [&](const Coordinates & id)
104  {
105  const auto ptr = reinterpret_cast<T *>(box_it.ptr());
106  const auto b0 = *ptr;
107  const auto b1 = *(ptr + 1);
108  const auto b2 = *(ptr + 2);
109  const auto b3 = *(ptr + 3);
110  const T width = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
111  const T height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
112  const T ctr_x = (b0 / scale_before) + T(0.5f) * width;
113  const T ctr_y = (b1 / scale_before) + T(0.5f) * height;
114  for(size_t j = 0; j < num_classes; ++j)
115  {
116  // Extract deltas
117  const size_t delta_id = id.y() * deltas_width + 4u * j;
118  const T dx = delta_ptr[delta_id] / T(bbinfo.weights()[0]);
119  const T dy = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]);
120  T dw = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]);
121  T dh = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]);
122  // Clip dw and dh
123  dw = std::min(dw, T(bbinfo.bbox_xform_clip()));
124  dh = std::min(dh, T(bbinfo.bbox_xform_clip()));
125  // Determine the predictions
126  const T pred_ctr_x = dx * width + ctr_x;
127  const T pred_ctr_y = dy * height + ctr_y;
128  const T pred_w = std::exp(dw) * width;
129  const T pred_h = std::exp(dh) * height;
130  // Store the prediction into the output tensor
131  pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
132  pred_ptr[delta_id + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
133  pred_ptr[delta_id + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
134  pred_ptr[delta_id + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
135  }
136  },
137  box_it);
138 }
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:1084
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77

◆ bounding_box_transform< float >()

template void arm_compute::cpu::bounding_box_transform< float > ( const ITensor *boxes,
ITensor *pred_boxes,
const ITensor *deltas,
BoundingBoxTransformInfo  bbinfo,
const Window &window 
)

◆ bounding_box_transform_qsymm16()

void bounding_box_transform_qsymm16 ( const ITensor *boxes,
ITensor *pred_boxes,
const ITensor *deltas,
BoundingBoxTransformInfo  bbinfo,
const Window &window 
)

Definition at line 29 of file impl.cpp.

References BoundingBoxTransformInfo::apply_scale(), BoundingBoxTransformInfo::bbox_xform_clip(), ITensor::buffer(), BoundingBoxTransformInfo::correct_transform_coords(), arm_compute::dequantize_qasymm16(), arm_compute::dequantize_qasymm8(), arm_compute::execute_window_loop(), BoundingBoxTransformInfo::img_height(), BoundingBoxTransformInfo::img_width(), ITensor::info(), offset(), ITensorInfo::offset_first_element_in_bytes(), Iterator::ptr(), ITensorInfo::quantization_info(), arm_compute::quantize_qasymm16(), BoundingBoxTransformInfo::scale(), ITensorInfo::tensor_shape(), QuantizationInfo::uniform(), and BoundingBoxTransformInfo::weights().

Referenced by neon_qu16_boundingboxtransform().

31 {
32  const size_t num_classes = deltas->info()->tensor_shape()[0] >> 2;
33  const size_t deltas_width = deltas->info()->tensor_shape()[0];
34  const int img_h = std::floor(bbinfo.img_height() / bbinfo.scale() + 0.5f);
35  const int img_w = std::floor(bbinfo.img_width() / bbinfo.scale() + 0.5f);
36 
37  const auto scale_after = (bbinfo.apply_scale() ? bbinfo.scale() : 1.f);
38  const auto scale_before = bbinfo.scale();
39  const auto offset = (bbinfo.correct_transform_coords() ? 1.f : 0.f);
40 
41  auto pred_ptr = reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
42  auto delta_ptr = reinterpret_cast<uint8_t *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
43 
44  const auto boxes_qinfo = boxes->info()->quantization_info().uniform();
45  const auto deltas_qinfo = deltas->info()->quantization_info().uniform();
46  const auto pred_qinfo = pred_boxes->info()->quantization_info().uniform();
47 
48  Iterator box_it(boxes, window);
49  execute_window_loop(window, [&](const Coordinates & id)
50  {
51  const auto ptr = reinterpret_cast<uint16_t *>(box_it.ptr());
52  const auto b0 = dequantize_qasymm16(*ptr, boxes_qinfo);
53  const auto b1 = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
54  const auto b2 = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
55  const auto b3 = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
56  const float width = (b2 / scale_before) - (b0 / scale_before) + 1.f;
57  const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
58  const float ctr_x = (b0 / scale_before) + 0.5f * width;
59  const float ctr_y = (b1 / scale_before) + 0.5f * height;
60  for(size_t j = 0; j < num_classes; ++j)
61  {
62  // Extract deltas
63  const size_t delta_id = id.y() * deltas_width + 4u * j;
64  const float dx = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0];
65  const float dy = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1];
66  float dw = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2];
67  float dh = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3];
68  // Clip dw and dh
69  dw = std::min(dw, bbinfo.bbox_xform_clip());
70  dh = std::min(dh, bbinfo.bbox_xform_clip());
71  // Determine the predictions
72  const float pred_ctr_x = dx * width + ctr_x;
73  const float pred_ctr_y = dy * height + ctr_y;
74  const float pred_w = std::exp(dw) * width;
75  const float pred_h = std::exp(dh) * height;
76  // Store the prediction into the output tensor
77  pred_ptr[delta_id] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
78  pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
79  pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo);
80  pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo);
81  }
82  },
83  box_it);
84 }
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:1084
float dequantize_qasymm8(uchar input, float offset, float scale)
Dequantize a scalar value from 8-bit asymmetric to floating-point.
Definition: helpers_asymm.h:62
float dequantize_qasymm16(uint16_t value, const UniformQuantizationInfo &qinfo)
Dequantize a value given a 16-bit asymmetric quantization scheme.
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
uint16_t quantize_qasymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given a 16-bit asymmetric quantization scheme.

◆ common_neon_scale()

void arm_compute::cpu::common_neon_scale ( const ITensor *src,
ITensor *dst,
const ITensor *offsets,
const ITensor *dx,
const ITensor *dy,
InterpolationPolicy  policy,
BorderMode  border_mode,
PixelValue  constant_border_value,
float  sampling_offset,
bool  align_corners,
const Window &window 
)

Definition at line 398 of file list.h.

References arm_compute::BILINEAR, arm_compute::test::validation::dst, arm_compute::NEAREST_NEIGHBOR, and arm_compute::test::validation::src.

401 {
402  if(policy == InterpolationPolicy::BILINEAR)
403  {
404  bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
405  }
406  else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
407  {
408  nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window);
409  }
410 }
SimpleTensor< float > src
Definition: DFT.cpp:155

◆ compute_all_anchors()

void compute_all_anchors ( const ITensor *anchors,
ITensor *all_anchors,
ComputeAnchorsInfo  anchors_info,
const Window &window 
)

Definition at line 32 of file impl.cpp.

References compute_all_anchors< float >(), ITensorInfo::dimension(), arm_compute::execute_window_loop(), ComputeAnchorsInfo::feat_width(), ITensor::info(), Iterator::ptr(), ITensor::ptr_to_element(), ComputeAnchorsInfo::spatial_scale(), and Window::y().

33 {
34  Iterator all_anchors_it(all_anchors, window);
35  Iterator anchors_it(all_anchors, window);
36 
37  const size_t num_anchors = anchors->info()->dimension(1);
38  const T stride = 1.f / anchors_info.spatial_scale();
39  const size_t feat_width = anchors_info.feat_width();
40 
41  execute_window_loop(window, [&](const Coordinates & id)
42  {
43  const size_t anchor_offset = id.y() % num_anchors;
44 
45  const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
46  const auto anchor_ptr = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
47 
48  const size_t shift_idy = id.y() / num_anchors;
49  const T shiftx = (shift_idy % feat_width) * stride;
50  const T shifty = (shift_idy / feat_width) * stride;
51 
52  *out_anchor_ptr = *anchor_ptr + shiftx;
53  *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
54  *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
55  *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
56  },
57  all_anchors_it);
58 }
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77

◆ compute_all_anchors< float >()

template void arm_compute::cpu::compute_all_anchors< float > ( const ITensor *anchors,
ITensor *all_anchors,
ComputeAnchorsInfo  anchors_info,
const Window &window 
)

◆ compute_all_anchors_qasymm16()

void compute_all_anchors_qasymm16 ( const ITensor *anchors,
ITensor *all_anchors,
ComputeAnchorsInfo  anchors_info,
const Window &window 
)

Definition at line 65 of file impl.cpp.

References arm_compute::dequantize_qsymm16(), ITensorInfo::dimension(), arm_compute::execute_window_loop(), ComputeAnchorsInfo::feat_width(), ITensor::info(), Iterator::ptr(), ITensor::ptr_to_element(), arm_compute::test::validation::qinfo, ITensorInfo::quantization_info(), arm_compute::quantize_qsymm16(), UniformQuantizationInfo::scale, ComputeAnchorsInfo::spatial_scale(), QuantizationInfo::uniform(), and Window::y().

Referenced by neon_qu16_computeallanchors().

66 {
67  Iterator all_anchors_it(all_anchors, window);
68  Iterator anchors_it(all_anchors, window);
69 
70  const size_t num_anchors = anchors->info()->dimension(1);
71  const float stride = 1.f / anchors_info.spatial_scale();
72  const size_t feat_width = anchors_info.feat_width();
73 
74  const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
75 
76  execute_window_loop(window, [&](const Coordinates & id)
77  {
78  const size_t anchor_offset = id.y() % num_anchors;
79 
80  const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
81  const auto anchor_ptr = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
82 
83  const size_t shift_idy = id.y() / num_anchors;
84  const float shiftx = (shift_idy % feat_width) * stride;
85  const float shifty = (shift_idy / feat_width) * stride;
86 
87  const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
88  const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
89  const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
90  const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
91 
92  *out_anchor_ptr = quantize_qsymm16(new_anchor_x1, qinfo.scale);
93  *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
94  *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
95  *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
96  },
97  all_anchors_it);
98 }
int16_t quantize_qsymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given a 16-bit symmetric quantization scheme.
float dequantize_qsymm16(int16_t value, const UniformQuantizationInfo &qinfo)
Dequantize a value given a 16-bit symmetric quantization scheme.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
const QuantizationInfo qinfo
Definition: Im2Col.cpp:155
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77

◆ compute_region_coordinate()

float arm_compute::cpu::compute_region_coordinate ( int  p,
float  bin_size,
float  roi_anchor,
float  max_value 
)
inline

Definition at line 206 of file impl.cpp.

References arm_compute::utility::clamp().

Referenced by roi_align().

207 {
208  const float region_start = p * bin_size + roi_anchor;
209  return utility::clamp(region_start, 0.0f, max_value);
210 }
DataType clamp(const DataType &n, const DataType &lower=std::numeric_limits< RangeType >::lowest(), const DataType &upper=std::numeric_limits< RangeType >::max())
Performs clamping among a lower and upper value.
Definition: Utility.h:101

◆ directconv3d_float_neon_ndhwc()

void arm_compute::cpu::directconv3d_float_neon_ndhwc ( const ITensor *src0,
const ITensor *src1,
const ITensor *src2,
ITensor *dst,
const Conv3dInfo &conv_info,
const Window &window 
)

Definition at line 39 of file list.h.

References ITensor::buffer(), arm_compute::calculate_max_window(), conv_pad_left, conv_pad_top, Size3D::depth, ITensorInfo::dimension(), Window::DimW, Window::DimX, Window::DimY, Window::DimZ, ITensorInfo::element_size(), arm_compute::execute_window_loop(), Padding3D::front, Size3D::height, ITensor::info(), arm_compute::test::validation::k, Padding3D::left, ITensorInfo::offset_first_element_in_bytes(), Conv3dInfo::padding, Iterator::ptr(), Window::set(), arm_compute::test::validation::src, Conv3dInfo::stride, ITensorInfo::strides_in_bytes(), Padding3D::top, type, arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vloadq(), arm_compute::wrapper::vmla(), arm_compute::vreduce(), arm_compute::wrapper::vsetlane(), Size3D::width, Dimensions< T >::y(), and Dimensions< T >::z().

40 {
41  const ITensor *src = src0;
42  const ITensor *weights = src1;
43  const ITensor *biases = src2;
44 
45  using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
46  using vector_type = typename vtype::type;
47  using tag_type = typename vtype::tag_type;
48  constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
49 
50  // Scalar quantities (N D H W Cin)
51  const int element_size = src->info()->element_size();
52  const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
53  const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
54  const int input_stride_d = src->info()->strides_in_bytes()[3] / element_size;
55  const int input_stride_n