Compute Library 21.02
arm_compute::detail Namespace Reference

Data Structures

struct  brelu
 Bounded RELU activation object. More...
 
class  compare_dimension
 Function to compare two Dimensions objects and throw an error on mismatch. More...
 
struct  dummy
 Dummy activation object. More...
 
struct  get_tensor_info_t
 Get the info for a tensor; dummy struct for the generic case. More...
 
struct  get_tensor_info_t< ITensorInfo * >
 Get the info for a tensor. More...
 
struct  linear
 Linear activation object. More...
 
struct  logistic
 Logistic activation object. More...
 
struct  lubrelu
 Lower-Upper Bounded RELU activation object. More...
 
struct  relu
 RELU activation object. More...
 
struct  square
 Square activation object. More...
 

Functions

template<typename T >
bool have_different_dimensions (const Dimensions< T > &dim1, const Dimensions< T > &dim2, unsigned int upper_dim)
 
template<typename F >
arm_compute::Status for_each_error (F &&)
 
template<typename F , typename T , typename... Ts>
arm_compute::Status for_each_error (F &&func, T &&arg, Ts &&... args)
 
float32x4x3_t load_matrix_row (const float *ptr)
 
template<unsigned int stridex>
float32x4x2_t convolve_3x3 (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<>
float32x4x2_t convolve_3x3< 1 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<>
float32x4x2_t convolve_3x3< 2 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<>
float32x4x2_t convolve_3x3< 3 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<unsigned int stridex>
void store_results (float *buffer, const float32x4x2_t &values)
 Stores a float32x4x2_t array into a memory location. More...
 
template<>
void store_results< 1 > (float *buffer, const float32x4x2_t &values)
 
template<>
void store_results< 2 > (float *buffer, const float32x4x2_t &values)
 
template<>
void store_results< 3 > (float *buffer, const float32x4x2_t &values)
 
template<unsigned int stridex>
int get_input_num_elems_processed (unsigned int num_elems_written_per_iteration)
 
template<>
int get_input_num_elems_processed< 1 > (unsigned int num_elems_written_per_iteration)
 
template<>
int get_input_num_elems_processed< 2 > (unsigned int num_elems_written_per_iteration)
 
template<>
int get_input_num_elems_processed< 3 > (unsigned int num_elems_written_per_iteration)
 
float32x4x3_t load_matrix_row (const float *ptr, int weights_offset=0)
 Loads a 3x3 matrix as a row (float). More...
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x3_t load_matrix_row (const T *ptr, int weights_offset=0)
 Loads a 3x3 matrix as a row (uint8_t/int8_t). More...
 
template<unsigned int stridex>
void store_results (int32_t *buffer, const int32x4x2_t &values)
 Stores an int32x4x2_t array into a memory location. More...
 
template<>
void store_results< 1 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void store_results< 2 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void store_results< 3 > (int32_t *buffer, const int32x4x2_t &values)
 
template<unsigned int stridex>
void accumulate_results (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 1 > (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 2 > (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 3 > (float *buffer, const float32x4x2_t &values)
 
template<unsigned int stridex>
void accumulate_results (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 1 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 2 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 3 > (int32_t *buffer, const int32x4x2_t &values)
 
float32x4_t single_convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, int input_offset)
 Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. More...
 
float32x4x2_t convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset=0)
 Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. More...
 
template<bool accumulate>
void convolve_3x3 (const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, unsigned int stridex, int input_offset=0)
 Perform a convolve3x3 on float32. More...
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4_t single_convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, size_t dilation_x, int32_t input_offset)
 Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. More...
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x2_t convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset)
 Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. More...
 
template<bool accumulate, typename T1 , typename T2 , ARM_COMPUTE_REQUIRES_TA(std::is_same< T1, uint8_t >::value||std::is_same< T1, int8_t >::value) >
void convolve_3x3 (const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, unsigned int stridex, int32_t input_offset)
 Perform a convolve3x3 on 8-bit elements. More...
 
int get_input_num_elems_processed (unsigned int num_elems_written_per_iteration, unsigned int stridex)
 Get the number of elements processed on 3x3 convolution. More...
 

Function Documentation

◆ accumulate_results() [1/2]

void arm_compute::detail::accumulate_results ( float *  buffer,
const float32x4x2_t &  values 
)
inline

◆ accumulate_results() [2/2]

void arm_compute::detail::accumulate_results ( int32_t *  buffer,
const int32x4x2_t &  values 
)

◆ accumulate_results< 1 >() [1/2]

void arm_compute::detail::accumulate_results< 1 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 145 of file NEDirectConvolutionDetail.h.

Referenced by accumulate_results< 3 >(), and convolve_3x3().

146 {
147  vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
148  vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
149 }

◆ accumulate_results< 1 >() [2/2]

void arm_compute::detail::accumulate_results< 1 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 167 of file NEDirectConvolutionDetail.h.

168 {
169  vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
170  vst1q_s32(buffer + 4, vaddq_s32(vld1q_s32(buffer + 4), values.val[1]));
171 }

◆ accumulate_results< 2 >() [1/2]

void arm_compute::detail::accumulate_results< 2 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 152 of file NEDirectConvolutionDetail.h.

Referenced by accumulate_results< 3 >(), and convolve_3x3().

153 {
154  vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
155 }

◆ accumulate_results< 2 >() [2/2]

void arm_compute::detail::accumulate_results< 2 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 174 of file NEDirectConvolutionDetail.h.

175 {
176  vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
177 }

◆ accumulate_results< 3 >() [1/2]

void arm_compute::detail::accumulate_results< 3 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 158 of file NEDirectConvolutionDetail.h.

References accumulate_results().

Referenced by accumulate_results< 3 >(), and convolve_3x3().

159 {
160  vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
161 }

◆ accumulate_results< 3 >() [2/2]

void arm_compute::detail::accumulate_results< 3 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 180 of file NEDirectConvolutionDetail.h.

References accumulate_results(), accumulate_results< 1 >(), accumulate_results< 2 >(), accumulate_results< 3 >(), store_results(), store_results< 1 >(), store_results< 2 >(), store_results< 3 >(), vadd_f16(), and vaddq_f16().

181 {
182  vst1_s32(buffer, vadd_s32(vld1_s32(buffer), vget_low_s32(values.val[0])));
183 }

◆ convolve_3x3() [1/3]

float32x4x2_t arm_compute::detail::convolve_3x3 ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)

◆ convolve_3x3() [2/3]

void convolve_3x3 ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
float *  out_ptr,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2,
unsigned int  stridex,
int  input_offset = 0 
)
inline

Perform a convolve3x3 on float32.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[out]out_ptrPointer to the output.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]stridexStride value in elements across x.
[in]input_offset(Optional) Input quantization offset.

Definition at line 353 of file NEDirectConvolutionDetail.h.

References accumulate(), accumulate_results< 1 >(), accumulate_results< 2 >(), accumulate_results< 3 >(), ARM_COMPUTE_ERROR_ON, ARM_COMPUTE_UNUSED, store_results< 1 >(), store_results< 2 >(), and store_results< 3 >().

356 {
357  ARM_COMPUTE_UNUSED(input_offset);
358  ARM_COMPUTE_ERROR_ON(stridex > 3);
359 
360  float32x4x2_t out =
361  {
362  {
363  vdupq_n_f32(0.f),
364  vdupq_n_f32(0.f)
365  }
366  };
367  if(stridex == 2)
368  {
369  const float32x4x2_t vtop = vld2q_f32(in_top);
370  const float32x4x2_t vmid = vld2q_f32(in_mid);
371  const float32x4x2_t vlow = vld2q_f32(in_low);
372  const float32x4_t vtop_end = vld1q_f32(in_top + 8);
373  const float32x4_t vmid_end = vld1q_f32(in_mid + 8);
374  const float32x4_t vlow_end = vld1q_f32(in_low + 8);
375 
376  out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
377 
378  out.val[0] = vmlaq_f32(out.val[0], vtop.val[1], m0.val[1]);
379  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop_end, 1), m0.val[2]);
380 
381  out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
382  out.val[0] = vmlaq_f32(out.val[0], vmid.val[1], m1.val[1]);
383  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid_end, 1), m1.val[2]);
384 
385  out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
386  out.val[0] = vmlaq_f32(out.val[0], vlow.val[1], m2.val[1]);
387  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow_end, 1), m2.val[2]);
388 
389  accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
390  }
391  else
392  {
393  const float32x4x3_t vtop =
394  {
395  {
396  vld1q_f32(in_top),
397  vld1q_f32(in_top + 4),
398  vld1q_f32(in_top + 8)
399  }
400  };
401  const float32x4x3_t vmid =
402  {
403  {
404  vld1q_f32(in_mid),
405  vld1q_f32(in_mid + 4),
406  vld1q_f32(in_mid + 8)
407  }
408  };
409  const float32x4x3_t vlow =
410  {
411  {
412  vld1q_f32(in_low),
413  vld1q_f32(in_low + 4),
414  vld1q_f32(in_low + 8)
415  }
416  };
417  out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
418  out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
419 
420  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
421  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
422 
423  out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
424  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
425  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
426 
427  out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
428  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
429  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
430 
431  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
432  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
433 
434  out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
435  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
436  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
437 
438  out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
439  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
440  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
441 
442  if(stridex == 3)
443  {
444  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
445  accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
446  }
447  else
448  {
449  accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
450  }
451  }
452 }
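A minimal usage sketch (not library code): load the three filter rows with load_matrix_row() and let convolve_3x3<accumulate>() compute and write eight stride-1 outputs. The helper name conv_step and the include path are illustrative; adjust them to your tree.

#include <arm_neon.h>
#include "NEDirectConvolutionDetail.h" // detail header documented on this page

void conv_step(const float *in_top, const float *in_mid, const float *in_low,
               const float *weights, float *out_ptr)
{
    using namespace arm_compute::detail;
    const float32x4x3_t m0 = load_matrix_row(weights);     // first filter row, each weight broadcast to all lanes
    const float32x4x3_t m1 = load_matrix_row(weights + 3); // second filter row
    const float32x4x3_t m2 = load_matrix_row(weights + 6); // third filter row
    // accumulate == false: the eight results overwrite out_ptr via store_results<1>().
    convolve_3x3<false>(in_top, in_mid, in_low, out_ptr, m0, m1, m2, /* stridex */ 1);
}

With accumulate set to true the same call adds onto the existing contents of out_ptr instead, which is useful when summing partial results, for example over input channels.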

◆ convolve_3x3() [3/3]

void arm_compute::detail::convolve_3x3 ( const T1 *  in_top,
const T1 *  in_mid,
const T1 *  in_low,
T2 *  out_ptr,
const int32x4x3_t &  m0,
const int32x4x3_t &  m1,
const int32x4x3_t &  m2,
unsigned int  stridex,
int32_t  input_offset 
)

Perform a convolve3x3 on 8-bit elements.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[out]out_ptrPointer to the output.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]stridexStride value in elements across x.
[in]input_offsetInput quantization offset.

Definition at line 594 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR_ON.

597 {
598  ARM_COMPUTE_ERROR_ON(stridex > 3);
599  using VectorType = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
600  using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
601 
602  const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
603 
604  const VectorType vtop =
605  {
606  {
607  wrapper::vload(in_top),
608  wrapper::vload(in_top + 8)
609  }
610  };
611  const VectorType vmid =
612  {
613  {
614  wrapper::vload(in_mid),
615  wrapper::vload(in_mid + 8)
616  }
617  };
618  const VectorType vlow =
619  {
620  {
621  wrapper::vload(in_low),
622  wrapper::vload(in_low + 8)
623  }
624  };
625 
626  const int32x4x3_t vtop_s32 =
627  {
628  {
632  }
633  };
634  const int32x4x3_t vmid_s32 =
635  {
636  {
640  }
641  };
642  const int32x4x3_t vlow_s32 =
643  {
644  {
648  }
649  };
650 
651  int32x4x2_t out
652  {
653  {
654  wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
655  wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
656  }
657  };
658 
659  // 0
660  out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
661  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vtop_s32.val[0], vtop_s32.val[1]), m0.val[1]);
662  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vtop_s32.val[0], vtop_s32.val[1]), m0.val[2]);
663 
664  out.val[0] = wrapper::vmla(out.val[0], vmid_s32.val[0], m1.val[0]);
665  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vmid_s32.val[0], vmid_s32.val[1]), m1.val[1]);
666  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vmid_s32.val[0], vmid_s32.val[1]), m1.val[2]);
667 
668  out.val[0] = wrapper::vmla(out.val[0], vlow_s32.val[0], m2.val[0]);
669  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vlow_s32.val[0], vlow_s32.val[1]), m2.val[1]);
670  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vlow_s32.val[0], vlow_s32.val[1]), m2.val[2]);
671 
672  // 1
673  out.val[1] = wrapper::vmla(out.val[1], vtop_s32.val[1], m0.val[0]);
674  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vtop_s32.val[1], vtop_s32.val[2]), m0.val[1]);
675  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vtop_s32.val[1], vtop_s32.val[2]), m0.val[2]);
676 
677  out.val[1] = wrapper::vmla(out.val[1], vmid_s32.val[1], m1.val[0]);
678  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vmid_s32.val[1], vmid_s32.val[2]), m1.val[1]);
679  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vmid_s32.val[1], vmid_s32.val[2]), m1.val[2]);
680 
681  out.val[1] = wrapper::vmla(out.val[1], vlow_s32.val[1], m2.val[0]);
682  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
683  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
684 
685  if(stridex == 1)
686  {
687  accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
688  }
689  else if(stridex == 2)
690  {
691  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
692  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
693  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
694 
695  accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
696  }
697  else if(stridex == 3)
698  {
699  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
700  accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
701  }
702 }
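The quantized overload follows the same pattern; a hedged sketch (the helper name and offset values are placeholders, not library code):

#include <arm_neon.h>
#include "NEDirectConvolutionDetail.h" // adjust the include path to your tree

void conv_step_u8(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
                  const uint8_t *weights, int32_t *out_ptr,
                  int32_t input_offset, int32_t weights_offset) // quantization offsets as described above
{
    using namespace arm_compute::detail;
    const int32x4x3_t m0 = load_matrix_row(weights, weights_offset);     // widened, offset-adjusted filter rows
    const int32x4x3_t m1 = load_matrix_row(weights + 3, weights_offset);
    const int32x4x3_t m2 = load_matrix_row(weights + 6, weights_offset);
    // accumulate == false: overwrite the int32 accumulators; stride 1 across x.
    convolve_3x3<false>(in_top, in_mid, in_low, out_ptr, m0, m1, m2, /* stridex */ 1, input_offset);
}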

◆ convolve_3x3< 1 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 1 > ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)
inline

Definition at line 51 of file NEDirectConvolution3x3.h.

Referenced by convolve_3x3< 2 >(), and convolve_3x3< 3 >().

52 {
53  const float32x4x3_t vtop =
54  {
55  {
56  vld1q_f32(in_top),
57  vld1q_f32(in_top + 4),
58  vld1q_f32(in_top + 8)
59  }
60  };
61  const float32x4x3_t vmid =
62  {
63  {
64  vld1q_f32(in_mid),
65  vld1q_f32(in_mid + 4),
66  vld1q_f32(in_mid + 8)
67  }
68  };
69  const float32x4x3_t vlow =
70  {
71  {
72  vld1q_f32(in_low),
73  vld1q_f32(in_low + 4),
74  vld1q_f32(in_low + 8)
75  }
76  };
77  float32x4x2_t out =
78  {
79  {
80  vmulq_f32(vtop.val[0], m0.val[0]),
81  vmulq_f32(vtop.val[1], m0.val[0])
82  }
83  };
84  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
85  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
86 
87  out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
88  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
89  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
90 
91  out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
92  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
93  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
94 
95  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
96  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
97 
98  out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
99  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
100  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
101 
102  out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
103  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
104  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
105  return out;
106 }

◆ convolve_3x3< 2 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 2 > ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)
inline

Definition at line 109 of file NEDirectConvolution3x3.h.

References convolve_3x3< 1 >().

110 {
111  float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
112  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
113  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
114  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
115  return out;
116 }

◆ convolve_3x3< 3 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 3 > ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)
inline

Definition at line 119 of file NEDirectConvolution3x3.h.

References convolve_3x3< 1 >(), and store_results().

120 {
121  float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
122  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
123  return out;
124 }
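The stride-templated overloads above only compute the packed results; a store_results call with the matching stride writes the lanes that are valid for that stride. A sketch (illustrative only; adjust the include path to your tree):

#include <arm_neon.h>
#include "NEDirectConvolution3x3.h" // header documented on this page

void conv3x3_stride2(const float *in_top, const float *in_mid, const float *in_low,
                     const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
                     float *out_ptr)
{
    using namespace arm_compute::detail;
    const float32x4x2_t res = convolve_3x3<2>(in_top, in_mid, in_low, m0, m1, m2); // stride 2 across x
    store_results<2>(out_ptr, res); // stride 2: only the first four lanes are written
}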

◆ convolve_3x3_dilation() [1/2]

float32x4x2_t arm_compute::detail::convolve_3x3_dilation ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2,
const size_t  dilation_x,
unsigned int  stridex,
int  input_offset = 0 
)
inline

Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]dilation_xDilation, in elements across x.
[in]stridexStride value in elements across x.
[in]input_offset(Optional) Input quantization offset.

Definition at line 307 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR_ON, convolve_3x3(), and single_convolve_3x3_dilation().

310 {
311  ARM_COMPUTE_ERROR_ON(stridex > 3);
312  float32x4x2_t out =
313  {
314  {
315  single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
316  single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
317  }
318  };
319 
320  if(stridex == 2)
321  {
322  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
323  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
324  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
325  }
326  else if(stridex == 3)
327  {
328  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
329  }
330 
331  return out;
332 }
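A hedged sketch of how the dilated helper pairs with the stride-specific store (the dilation factor and function name are illustrative):

#include <arm_neon.h>
#include "NEDirectConvolutionDetail.h" // adjust the include path to your tree

void dilated_step(const float *in_top, const float *in_mid, const float *in_low,
                  const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
                  float *out_ptr)
{
    using namespace arm_compute::detail;
    const float32x4x2_t res =
        convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, /* dilation_x */ 2, /* stridex */ 1);
    store_results<1>(out_ptr, res); // stride 1 keeps all eight results
}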

◆ convolve_3x3_dilation() [2/2]

int32x4x2_t arm_compute::detail::convolve_3x3_dilation ( const T *  in_top,
const T *  in_mid,
const T *  in_low,
const int32x4x3_t &  m0,
const int32x4x3_t &  m1,
const int32x4x3_t &  m2,
const size_t  dilation_x,
unsigned int  stridex,
int  input_offset 
)
inline

Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]dilation_xDilation, in elements across x.
[in]stridexStride value in elements across x.
[in]input_offsetInput quantization offset.

Definition at line 555 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR_ON, single_convolve_3x3_dilation(), arm_compute::wrapper::vgetlane(), and arm_compute::wrapper::vsetlane().

557 {
558  ARM_COMPUTE_ERROR_ON(stridex > 3);
559  int32x4x2_t out =
560  {
561  {
562  single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
563  single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
564  }
565  };
566 
567  if(stridex == 2)
568  {
569  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
570  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
571  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
572  }
573  else if(stridex == 3)
574  {
575  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
576  }
577  return out;
578 }

◆ for_each_error() [1/2]

arm_compute::Status arm_compute::detail::for_each_error ( F &&  )
inline

Definition at line 108 of file Validate.h.

Referenced by arm_compute::error_on_mismatching_dimensions(), and for_each_error().

109 {
110  return arm_compute::Status{};
111 }

◆ for_each_error() [2/2]

arm_compute::Status arm_compute::detail::for_each_error ( F &&  func,
T &&  arg,
Ts &&...  args 
)
inline

Definition at line 114 of file Validate.h.

References GemmTuner::args, ARM_COMPUTE_RETURN_ON_ERROR, for_each_error(), and func.

115 {
116  ARM_COMPUTE_RETURN_ON_ERROR(func(arg));
117  ARM_COMPUTE_RETURN_ON_ERROR(for_each_error(func, args...));
118  return arm_compute::Status{};
119 }
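Read together, the two overloads fold a Status-returning callable over a parameter pack and propagate the first failing Status. A hedged sketch of a caller (all_non_null is hypothetical, not a library helper):

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Validate.h"

// Assumes for_each_error() applies func to each argument in turn and stops at the first error.
template <typename... Ts>
arm_compute::Status all_non_null(const Ts *... ptrs)
{
    return arm_compute::detail::for_each_error(
        [](const void *p) -> arm_compute::Status
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MSG(p == nullptr, "unexpected nullptr");
            return arm_compute::Status{};
        },
        ptrs...);
}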

◆ get_input_num_elems_processed() [1/2]

int arm_compute::detail::get_input_num_elems_processed ( unsigned int  num_elems_written_per_iteration)

Referenced by store_results< 3 >().

◆ get_input_num_elems_processed() [2/2]

int arm_compute::detail::get_input_num_elems_processed ( unsigned int  num_elems_written_per_iteration,
unsigned int  stridex 
)
inline

Get the number of elements processed on 3x3 convolution.

Parameters
[in]num_elems_written_per_iterationNumber of elements written per iteration on 3x3 convolution.
[in]stridexStride value in elements across x.
Returns
The number of elements processed.

Definition at line 948 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR.

949 {
950  switch(stridex)
951  {
952  case 1:
953  return num_elems_written_per_iteration;
954  case 2:
955  return num_elems_written_per_iteration << 1;
956  case 3:
957  return num_elems_written_per_iteration * 3;
958  default:
959  ARM_COMPUTE_ERROR("stridex not supported");
960  return 0;
961  }
962 }
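In other words, the returned count is the output element count scaled by the stride. A quick sketch of the expected values (illustrative only; adjust the include path to your tree):

#include "NEDirectConvolutionDetail.h"

void example()
{
    using arm_compute::detail::get_input_num_elems_processed;
    const unsigned int written = 4;                            // outputs produced per iteration
    const int in1 = get_input_num_elems_processed(written, 1); // 4
    const int in2 = get_input_num_elems_processed(written, 2); // 8
    const int in3 = get_input_num_elems_processed(written, 3); // 12
    (void)in1; (void)in2; (void)in3;
}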

◆ get_input_num_elems_processed< 1 >()

int arm_compute::detail::get_input_num_elems_processed< 1 > ( unsigned int  num_elems_written_per_iteration)

Definition at line 152 of file NEDirectConvolution3x3.h.

153 {
154  return num_elems_written_per_iteration;
155 }

◆ get_input_num_elems_processed< 2 >()

int arm_compute::detail::get_input_num_elems_processed< 2 > ( unsigned int  num_elems_written_per_iteration)

Definition at line 158 of file NEDirectConvolution3x3.h.

159 {
160  return num_elems_written_per_iteration << 1;
161 }

◆ get_input_num_elems_processed< 3 >()

int arm_compute::detail::get_input_num_elems_processed< 3 > ( unsigned int  num_elems_written_per_iteration)

Definition at line 164 of file NEDirectConvolution3x3.h.

165 {
166  return num_elems_written_per_iteration * 3;
167 }

◆ have_different_dimensions()

bool arm_compute::detail::have_different_dimensions ( const Dimensions< T > &  dim1,
const Dimensions< T > &  dim2,
unsigned int  upper_dim 
)
inline

Definition at line 51 of file Validate.h.

Referenced by CLPixelWiseMultiplicationKernel::border_size(), arm_compute::error_on_mismatching_shapes(), arm_compute::error_on_tensors_not_even(), arm_compute::error_on_tensors_not_subsampled(), compare_dimension< T >::operator()(), NEPixelWiseMultiplicationKernel::run_op(), and NELogicalKernel::validate().

52 {
53  for(unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
54  {
55  if(dim1[i] != dim2[i])
56  {
57  return true;
58  }
59  }
60 
61  return false;
62 }
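Since TensorShape derives from Dimensions, a typical use is comparing two shapes from a given dimension upwards while ignoring the lower ones. A hedged sketch (the helper name is hypothetical):

#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Validate.h"

// Returns true if the two shapes agree on every dimension from index 1 upwards,
// i.e. they may differ only in dimension 0.
bool same_above_dim0(const arm_compute::TensorShape &a, const arm_compute::TensorShape &b)
{
    return !arm_compute::detail::have_different_dimensions(a, b, /* upper_dim */ 1);
}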

◆ load_matrix_row() [1/3]

float32x4x3_t arm_compute::detail::load_matrix_row ( const float *  ptr)
inline

Definition at line 34 of file NEDirectConvolution3x3.h.

References convolve_3x3().

35 {
36  const float32x4x3_t r =
37  {
38  {
39  vld1q_dup_f32(ptr),
40  vld1q_dup_f32(1 + ptr),
41  vld1q_dup_f32(2 + ptr)
42  }
43  };
44  return r;
45 }

◆ load_matrix_row() [2/3]

float32x4x3_t arm_compute::detail::load_matrix_row ( const float *  ptr,
int  weights_offset = 0 
)
inline

Loads a 3x3 matrix as a row (float).

Parameters
[in]ptrPointer to a float 3x3 matrix.
[in]weights_offset(Optional) Weights quantization offset.
Returns
The loaded matrix.

Definition at line 46 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_UNUSED.

47 {
48  ARM_COMPUTE_UNUSED(weights_offset);
49  const float32x4x3_t r =
50  {
51  {
52  vld1q_dup_f32(ptr),
53  vld1q_dup_f32(1 + ptr),
54  vld1q_dup_f32(2 + ptr)
55  }
56  };
57  return r;
58 }
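The returned row is laid out for vector multiply-accumulate: each filter weight is broadcast across a full vector. A small sketch of the resulting lanes (illustrative values; adjust the include path to your tree):

#include <arm_neon.h>
#include "NEDirectConvolutionDetail.h"

void example()
{
    const float row[3] = {0.25f, 0.5f, 0.25f}; // one row of a 3x3 filter
    const float32x4x3_t r = arm_compute::detail::load_matrix_row(row);
    // r.val[0] == {0.25, 0.25, 0.25, 0.25}
    // r.val[1] == {0.50, 0.50, 0.50, 0.50}
    // r.val[2] == {0.25, 0.25, 0.25, 0.25}
    (void)r;
}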

◆ load_matrix_row() [3/3]

int32x4x3_t arm_compute::detail::load_matrix_row ( const T *  ptr,
int  weights_offset = 0 
)
inline

Loads a 3x3 matrix as a row (uint8_t/int8_t).

Parameters
[in]ptrPointer to a uint8_t/int8_t 3x3 matrix.
[in]weights_offset(Optional) Weights quantization offset.
Returns
The loaded matrix.

Definition at line 68 of file NEDirectConvolutionDetail.h.

References store_results(), store_results< 1 >(), store_results< 2 >(), and store_results< 3 >().

69 {
70  const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
71 
72  /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
73  r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
74  int32x4x3_t r =
75  {
76  {
77  vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
78  vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
79  vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
80  }
81  };
82  return r;
83 }

◆ single_convolve_3x3_dilation() [1/2]

float32x4_t arm_compute::detail::single_convolve_3x3_dilation ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2,
const size_t  dilation_x,
int  input_offset 
)
inline

Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]dilation_xDilation, in elements across x.
[in]input_offset(Optional) Input quantization offset.

Definition at line 249 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_UNUSED.

Referenced by convolve_3x3_dilation().

252 {
253  ARM_COMPUTE_UNUSED(input_offset);
254 
255  const float32x4x3_t vtop =
256  {
257  {
258  vld1q_f32(in_top),
259  vld1q_f32(in_top + dilation_x),
260  vld1q_f32(in_top + 2 * dilation_x)
261  }
262  };
263  const float32x4x3_t vmid =
264  {
265  {
266  vld1q_f32(in_mid),
267  vld1q_f32(in_mid + dilation_x),
268  vld1q_f32(in_mid + 2 * dilation_x)
269  }
270  };
271  const float32x4x3_t vlow =
272  {
273  {
274  vld1q_f32(in_low),
275  vld1q_f32(in_low + dilation_x),
276  vld1q_f32(in_low + 2 * dilation_x)
277  }
278  };
279  float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
280  out = vmlaq_f32(out, vtop.val[1], m0.val[1]);
281  out = vmlaq_f32(out, vtop.val[2], m0.val[2]);
282 
283  out = vmlaq_f32(out, vmid.val[0], m1.val[0]);
284  out = vmlaq_f32(out, vmid.val[1], m1.val[1]);
285  out = vmlaq_f32(out, vmid.val[2], m1.val[2]);
286 
287  out = vmlaq_f32(out, vlow.val[0], m2.val[0]);
288  out = vmlaq_f32(out, vlow.val[1], m2.val[1]);
289  out = vmlaq_f32(out, vlow.val[2], m2.val[2]);
290 
291  return out;
292 }

◆ single_convolve_3x3_dilation() [2/2]

int32x4_t arm_compute::detail::single_convolve_3x3_dilation ( const T *  in_top,
const T *  in_mid,
const T *  in_low,
const int32x4x3_t &  m0,
const int32x4x3_t &  m1,
const int32x4x3_t &  m2,
size_t  dilation_x,
int32_t  input_offset 
)
inline

Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]dilation_xDilation, in elements across x.
[in]input_offsetInput quantization offset.

Definition at line 467 of file NEDirectConvolutionDetail.h.

470 {
471  using VectorType = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
472  using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
473 
474  const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
475 
476  const VectorType vtop =
477  {
478  {
479  wrapper::vload(in_top),
480  wrapper::vload(in_top + dilation_x),
481  wrapper::vload(in_top + 2 * dilation_x)
482  }
483  };
484  const VectorType vmid =
485  {
486  {
487  wrapper::vload(in_mid),
488  wrapper::vload(in_mid + dilation_x),
489  wrapper::vload(in_mid + 2 * dilation_x)
490  }
491  };
492  const VectorType vlow =
493  {
494  {
495  wrapper::vload(in_low),
496  wrapper::vload(in_low + dilation_x),
497  wrapper::vload(in_low + 2 * dilation_x)
498  }
499  };
500 
501  const int32x4x3_t vtop_s32 =
502  {
503  {
507  }
508  };
509  const int32x4x3_t vmid_s32 =
510  {
511  {
515  }
516  };
517  const int32x4x3_t vlow_s32 =
518  {
519  {
523  }
524  };
525 
526  int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
527  out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
528  out = wrapper::vmla(out, vtop_s32.val[2], m0.val[2]);
529 
530  out = wrapper::vmla(out, vmid_s32.val[0], m1.val[0]);
531  out = wrapper::vmla(out, vmid_s32.val[1], m1.val[1]);
532  out = wrapper::vmla(out, vmid_s32.val[2], m1.val[2]);
533 
534  out = wrapper::vmla(out, vlow_s32.val[0], m2.val[0]);
535  out = wrapper::vmla(out, vlow_s32.val[1], m2.val[1]);
536  out = wrapper::vmla(out, vlow_s32.val[2], m2.val[2]);
537 
538  return out;
539 }

◆ store_results() [1/2]

void arm_compute::detail::store_results ( int32_t *  buffer,
const int32x4x2_t &  values 
)

Stores an int32x4x2_t array into a memory location.

Parameters
[in]bufferPointer to the memory location where the values will be stored.
[in]valuesValues that will be stored.

◆ store_results() [2/2]

void store_results ( float *  buffer,
const float32x4x2_t &  values 
)

Stores a float32x4x2_t array into a memory location.

Parameters
[in]bufferPointer to the memory location where the values will be stored.
[in]valuesValues that will be stored.

Referenced by accumulate_results< 3 >(), NEConvolutionKernel< matrix_size >::configure(), convolve_3x3< 3 >(), load_matrix_row(), NESeparableConvolutionVertKernel< matrix_size >::run(), and NEConvolutionRectangleKernel::run().

◆ store_results< 1 >() [1/2]

void arm_compute::detail::store_results< 1 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 123 of file NEDirectConvolutionDetail.h.

124 {
125  vst1q_s32(buffer, values.val[0]);
126  vst1q_s32(buffer + 4, values.val[1]);
127 }

◆ store_results< 1 >() [2/2]

void store_results< 1 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 130 of file NEDirectConvolution3x3.h.

Referenced by accumulate_results< 3 >(), convolve_3x3(), and load_matrix_row().

131 {
132  vst1q_f32(buffer, values.val[0]);
133  vst1q_f32(buffer + 4, values.val[1]);
134 }

◆ store_results< 2 >() [1/2]

void arm_compute::detail::store_results< 2 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 130 of file NEDirectConvolutionDetail.h.

131 {
132  vst1q_s32(buffer, values.val[0]);
133 }

◆ store_results< 2 >() [2/2]

void store_results< 2 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 137 of file NEDirectConvolution3x3.h.

Referenced by accumulate_results< 3 >(), convolve_3x3(), and load_matrix_row().

138 {
139  vst1q_f32(buffer, values.val[0]);
140 }

◆ store_results< 3 >() [1/2]

void arm_compute::detail::store_results< 3 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 136 of file NEDirectConvolutionDetail.h.

References accumulate_results().

137 {
138  vst1_s32(buffer, vget_low_s32(values.val[0]));
139 }

◆ store_results< 3 >() [2/2]

void store_results< 3 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 143 of file NEDirectConvolution3x3.h.

References get_input_num_elems_processed().

Referenced by accumulate_results< 3 >(), convolve_3x3(), and load_matrix_row().

144 {
145  vst1_f32(buffer, vget_low_f32(values.val[0]));
146 }