Data Structures
struct	brelu
	Bounded RELU activation object. More...

class	compare_dimension
	Function to compare two Dimensions objects and throw an error on mismatch. More...

struct	dummy
	Dummy activation object. More...

struct	get_tensor_info_t
	Get the info for a tensor, dummy struct. More...

struct	get_tensor_info_t< ITensorInfo * >
	Get the info for a tensor. More...

struct	Header

struct	linear
	Linear activation object. More...

struct	logistic
	Logistic activation object. More...

struct	lubrelu
	Lower-Upper Bounded RELU activation object. More...

struct	relu
	RELU activation object. More...

struct	square
	Square activation object. More...

Enumerations
enum	ObjectType : uint32_t { Context = 1, Queue = 2, Tensor = 3, TensorPack = 4, Operator = 5, Invalid = 0x56DEAD78 }
	Object type enumerations. More...

Functions
template<typename T >
bool	have_different_dimensions (const Dimensions< T > &dim1, const Dimensions< T > &dim2, unsigned int upper_dim)

template<typename F >
arm_compute::Status	for_each_error (F &&)

template<typename F , typename T , typename... Ts>
arm_compute::Status	for_each_error (F &&func, T &&arg, Ts &&... args)

StatusCode	validate_internal_context (const IContext *ctx)
	Check if an internal context is valid. More...

StatusCode	validate_internal_queue (const IQueue *queue)
	Check if an internal queue is valid. More...

StatusCode	validate_internal_tensor (const ITensorV2 *tensor)
	Check if an internal tensor is valid. More...

StatusCode	validate_internal_pack (const TensorPack *pack)
	Check if an internal TensorPack is valid. More...

TensorInfo	convert_to_legacy_tensor_info (const AclTensorDescriptor &desc)
	Convert a descriptor to a legacy format one. More...

AclTensorDescriptor	convert_to_descriptor (const TensorInfo &info)
	Convert a legacy tensor meta-data to a descriptor. More...

float32x4x3_t	load_matrix_row (const float *ptr)

template<unsigned int stridex>
float32x4x2_t	convolve_3x3 (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)

template<>
float32x4x2_t	convolve_3x3< 1 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)

template<>
float32x4x2_t	convolve_3x3< 2 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)

template<>
float32x4x2_t	convolve_3x3< 3 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)

template<unsigned int stridex>
void	store_results (float *buffer, const float32x4x2_t &values)
	Stores a float32x4x2_t array into a memory location. More...

template<>
void	store_results< 1 > (float *buffer, const float32x4x2_t &values)

template<>
void	store_results< 2 > (float *buffer, const float32x4x2_t &values)

template<>
void	store_results< 3 > (float *buffer, const float32x4x2_t &values)

template<unsigned int stridex>
int	get_input_num_elems_processed (unsigned int num_elems_written_per_iteration)

template<>
int	get_input_num_elems_processed< 1 > (unsigned int num_elems_written_per_iteration)

template<>
int	get_input_num_elems_processed< 2 > (unsigned int num_elems_written_per_iteration)

template<>
int	get_input_num_elems_processed< 3 > (unsigned int num_elems_written_per_iteration)

float32x4x3_t	load_matrix_row (const float *ptr, int weights_offset=0)
	Loads a 3x3 matrix as a row (float). More...

template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x3_t	load_matrix_row (const T *ptr, int weights_offset=0)
	Loads a 3x3 matrix as a row (uint8_t/int8_t). More...

template<unsigned int stridex>
void	store_results (int32_t *buffer, const int32x4x2_t &values)
	Stores an int32x4x2_t array into a memory location. More...

template<>
void	store_results< 1 > (int32_t *buffer, const int32x4x2_t &values)

template<>
void	store_results< 2 > (int32_t *buffer, const int32x4x2_t &values)

template<>
void	store_results< 3 > (int32_t *buffer, const int32x4x2_t &values)

template<unsigned int stridex>
void	accumulate_results (float *buffer, const float32x4x2_t &values)

template<>
void	accumulate_results< 1 > (float *buffer, const float32x4x2_t &values)

template<>
void	accumulate_results< 2 > (float *buffer, const float32x4x2_t &values)

template<>
void	accumulate_results< 3 > (float *buffer, const float32x4x2_t &values)

template<unsigned int stridex>
void	accumulate_results (int32_t *buffer, const int32x4x2_t &values)

template<>
void	accumulate_results< 1 > (int32_t *buffer, const int32x4x2_t &values)

template<>
void	accumulate_results< 2 > (int32_t *buffer, const int32x4x2_t &values)

template<>
void	accumulate_results< 3 > (int32_t *buffer, const int32x4x2_t &values)

float32x4_t	single_convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, int input_offset)
	Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. More...

float32x4x2_t	convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset=0)
	Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. More...

template<bool accumulate>
void	convolve_3x3 (const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, unsigned int stridex, int input_offset=0)
	Perform a convolve3x3 on float32. More...

template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4_t	single_convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, size_t dilation_x, int32_t input_offset)
	Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. More...

template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x2_t	convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset)
	Perform a 3x3 convolution for 8 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. More...

template<bool accumulate, typename T1 , typename T2 , ARM_COMPUTE_REQUIRES_TA(std::is_same< T1, uint8_t >::value||std::is_same< T1, int8_t >::value) >
void	convolve_3x3 (const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, unsigned int stridex, int32_t input_offset)
	Perform a convolve3x3 on 8-bit elements. More...

int	get_input_num_elems_processed (unsigned int num_elems_written_per_iteration, unsigned int stridex)
	Get the number of elements processed on 3x3 convolution. More...

Enumeration Type Documentation

◆ ObjectType

enum ObjectType : uint32_t

strong

Object type enumerations.

Enumerator
Context
Queue
Tensor
TensorPack
Operator
Invalid

Definition at line 37 of file Object.h.

 enum class ObjectType : uint32_t
 {
     Context    = 1,
     Queue      = 2,
     Tensor     = 3,
     TensorPack = 4,
     Operator   = 5,
     Invalid    = 0x56DEAD78
 };

Function Documentation

◆ accumulate_results() [1/2]

void arm_compute::detail::accumulate_results	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

◆ accumulate_results() [2/2]

void arm_compute::detail::accumulate_results	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

◆ accumulate_results< 1 >() [1/2]

void arm_compute::detail::accumulate_results< 1 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 144 of file NEDirectConvolutionDetail.h.

 {
     vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
     vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
 }

Referenced by convolve_3x3().

◆ accumulate_results< 1 >() [2/2]

void arm_compute::detail::accumulate_results< 1 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 166 of file NEDirectConvolutionDetail.h.

 {
     vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
     vst1q_s32(buffer + 4, vaddq_s32(vld1q_s32(buffer + 4), values.val[1]));
 }

◆ accumulate_results< 2 >() [1/2]

void arm_compute::detail::accumulate_results< 2 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 151 of file NEDirectConvolutionDetail.h.

 {
     vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
 }

Referenced by convolve_3x3().

◆ accumulate_results< 2 >() [2/2]

void arm_compute::detail::accumulate_results< 2 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 173 of file NEDirectConvolutionDetail.h.

 {
     vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
 }

◆ accumulate_results< 3 >() [1/2]

void arm_compute::detail::accumulate_results< 3 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 157 of file NEDirectConvolutionDetail.h.

 {
     vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
 }

Referenced by convolve_3x3().

◆ accumulate_results< 3 >() [2/2]

void arm_compute::detail::accumulate_results< 3 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 179 of file NEDirectConvolutionDetail.h.

 {
     vst1_s32(buffer, vadd_s32(vld1_s32(buffer), vget_low_s32(values.val[0])));
 }

◆ convert_to_descriptor()

AclTensorDescriptor convert_to_descriptor ( const TensorInfo & info )

Convert a legacy tensor meta-data to a descriptor.

Parameters

[in] info Legacy tensor meta-data

Returns: A converted descriptor

Definition at line 97 of file LegacySupport.cpp.

 {
     const auto          num_dims = info.num_dimensions();
     AclTensorDescriptor desc
     {
         static_cast<int32_t>(num_dims),
         create_tensor_shape_array(info),
         convert_to_c_data_type(info.data_type()),
         nullptr,
         0
     };
     return desc;
 }

References arm_compute::test::validation::info.

Referenced by ITensorV2::get_descriptor().

◆ convert_to_legacy_tensor_info()

TensorInfo convert_to_legacy_tensor_info ( const AclTensorDescriptor & desc )

Convert a descriptor to a legacy format one.

Parameters

[in] desc Descriptor to convert

Returns: Legacy tensor meta-data

Definition at line 90 of file LegacySupport.cpp.

 {
     TensorInfo legacy_desc;
     legacy_desc.init(create_legacy_tensor_shape(desc.ndims, desc.shape), 1, convert_to_legacy_data_type(desc.data_type));
     return legacy_desc;
 }

References AclTensorDescriptor::data_type, TensorInfo::init(), AclTensorDescriptor::ndims, and AclTensorDescriptor::shape.

Referenced by ClTensor::ClTensor(), and CpuTensor::CpuTensor().

◆ convolve_3x3() [1/3]

float32x4x2_t arm_compute::detail::convolve_3x3	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2
	)

◆ convolve_3x3() [2/3]

void convolve_3x3	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		float *	out_ptr,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2,
		unsigned int	stridex,
		int	input_offset = 0
	)

inline

Perform a convolve3x3 on float32.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[out]	out_ptr	Pointer to the output.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	stridex	Stride value in elements across x.
[in]	input_offset	(Optional) Input quantization offset.

Definition at line 352 of file NEDirectConvolutionDetail.h.

 {
     ARM_COMPUTE_UNUSED(input_offset);
     ARM_COMPUTE_ERROR_ON(stridex > 3);
 
     float32x4x2_t out =
     {
         {
             vdupq_n_f32(0.f),
             vdupq_n_f32(0.f)
         }
     };
     if(stridex == 2)
     {
         const float32x4x2_t vtop     = vld2q_f32(in_top);
         const float32x4x2_t vmid     = vld2q_f32(in_mid);
         const float32x4x2_t vlow     = vld2q_f32(in_low);
         const float32x4_t   vtop_end = vld1q_f32(in_top + 8);
         const float32x4_t   vmid_end = vld1q_f32(in_mid + 8);
         const float32x4_t   vlow_end = vld1q_f32(in_low + 8);
 
         out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
 
         out.val[0] = vmlaq_f32(out.val[0], vtop.val[1], m0.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop_end, 1), m0.val[2]);
 
         out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
         out.val[0] = vmlaq_f32(out.val[0], vmid.val[1], m1.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid_end, 1), m1.val[2]);
 
         out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
         out.val[0] = vmlaq_f32(out.val[0], vlow.val[1], m2.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow_end, 1), m2.val[2]);
 
         accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
     }
     else
     {
         const float32x4x3_t vtop =
         {
             {
                 vld1q_f32(in_top),
                 vld1q_f32(in_top + 4),
                 vld1q_f32(in_top + 8)
             }
         };
         const float32x4x3_t vmid =
         {
             {
                 vld1q_f32(in_mid),
                 vld1q_f32(in_mid + 4),
                 vld1q_f32(in_mid + 8)
             }
         };
         const float32x4x3_t vlow =
         {
             {
                 vld1q_f32(in_low),
                 vld1q_f32(in_low + 4),
                 vld1q_f32(in_low + 8)
             }
         };
         out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
         out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
 
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
 
         out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
 
         out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
 
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
 
         out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
 
         out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
 
         if(stridex == 3)
         {
             out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
             accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
         }
         else
         {
             accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
         }
     }
 }

References arm_compute::test::validation::reference::accumulate(), accumulate_results< 1 >(), accumulate_results< 2 >(), accumulate_results< 3 >(), ARM_COMPUTE_ERROR_ON, ARM_COMPUTE_UNUSED, store_results< 1 >(), store_results< 2 >(), and store_results< 3 >().

◆ convolve_3x3() [3/3]

void arm_compute::detail::convolve_3x3	(	const T1 *	in_top,
		const T1 *	in_mid,
		const T1 *	in_low,
		T2 *	out_ptr,
		const int32x4x3_t &	m0,
		const int32x4x3_t &	m1,
		const int32x4x3_t &	m2,
		unsigned int	stridex,
		int32_t	input_offset
	)

Perform a convolve3x3 on 8-bit elements.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[out]	out_ptr	Pointer to the output.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	stridex	Stride value in elements across x.
[in]	input_offset	Input quantization offset.

Definition at line 593 of file NEDirectConvolutionDetail.h.

 {
     ARM_COMPUTE_ERROR_ON(stridex > 3);
     using VectorType    = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
     using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
 
     const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
 
     const VectorType vtop =
     {
         {
             wrapper::vload(in_top),
             wrapper::vload(in_top + 8)
         }
     };
     const VectorType vmid =
     {
         {
             wrapper::vload(in_mid),
             wrapper::vload(in_mid + 8)
         }
     };
     const VectorType vlow =
     {
         {
             wrapper::vload(in_low),
             wrapper::vload(in_low + 8)
         }
     };
 
     const int32x4x3_t vtop_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
         }
     };
     const int32x4x3_t vmid_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
         }
     };
     const int32x4x3_t vlow_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
         }
     };
 
     int32x4x2_t out
     {
         {
             wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
             wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
         }
     };
 
     // 0
     out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vtop_s32.val[0], vtop_s32.val[1]), m0.val[1]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vtop_s32.val[0], vtop_s32.val[1]), m0.val[2]);
 
     out.val[0] = wrapper::vmla(out.val[0], vmid_s32.val[0], m1.val[0]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vmid_s32.val[0], vmid_s32.val[1]), m1.val[1]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vmid_s32.val[0], vmid_s32.val[1]), m1.val[2]);
 
     out.val[0] = wrapper::vmla(out.val[0], vlow_s32.val[0], m2.val[0]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vlow_s32.val[0], vlow_s32.val[1]), m2.val[1]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vlow_s32.val[0], vlow_s32.val[1]), m2.val[2]);
 
     // 1
     out.val[1] = wrapper::vmla(out.val[1], vtop_s32.val[1], m0.val[0]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vtop_s32.val[1], vtop_s32.val[2]), m0.val[1]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vtop_s32.val[1], vtop_s32.val[2]), m0.val[2]);
 
     out.val[1] = wrapper::vmla(out.val[1], vmid_s32.val[1], m1.val[0]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vmid_s32.val[1], vmid_s32.val[2]), m1.val[1]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vmid_s32.val[1], vmid_s32.val[2]), m1.val[2]);
 
     out.val[1] = wrapper::vmla(out.val[1], vlow_s32.val[1], m2.val[0]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
 
     if(stridex == 1)
     {
         accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
     }
     else if(stridex == 2)
     {
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
 
         accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
     }
     else if(stridex == 3)
     {
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
         accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
     }
 }

◆ convolve_3x3< 1 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 1 >	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2
	)

inline

Definition at line 51 of file NEDirectConvolution3x3.h.

 {
     const float32x4x3_t vtop =
     {
         {
             vld1q_f32(in_top),
             vld1q_f32(in_top + 4),
             vld1q_f32(in_top + 8)
         }
     };
     const float32x4x3_t vmid =
     {
         {
             vld1q_f32(in_mid),
             vld1q_f32(in_mid + 4),
             vld1q_f32(in_mid + 8)
         }
     };
     const float32x4x3_t vlow =
     {
         {
             vld1q_f32(in_low),
             vld1q_f32(in_low + 4),
             vld1q_f32(in_low + 8)
         }
     };
     float32x4x2_t out =
     {
         {
             vmulq_f32(vtop.val[0], m0.val[0]),
             vmulq_f32(vtop.val[1], m0.val[0])
         }
     };
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
 
     out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
 
     out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
 
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
 
     out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
 
     out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
     return out;
 }

Referenced by convolve_3x3< 2 >(), and convolve_3x3< 3 >().

◆ convolve_3x3< 2 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 2 >	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2
	)

inline

Definition at line 109 of file NEDirectConvolution3x3.h.

 {
     float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
     return out;
 }

References convolve_3x3< 1 >().

◆ convolve_3x3< 3 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 3 >	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2
	)

inline

Definition at line 119 of file NEDirectConvolution3x3.h.

 {
     float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
     return out;
 }

References convolve_3x3< 1 >().

◆ convolve_3x3_dilation() [1/2]

float32x4x2_t arm_compute::detail::convolve_3x3_dilation	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2,
		const size_t	dilation_x,
		unsigned int	stridex,
		int	input_offset = 0
	)

inline

Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	dilation_x	Dilation, in elements across x.
[in]	stridex	Stride value in elements across x.
[in]	input_offset	(Optional) Input quantization offset.

Definition at line 306 of file NEDirectConvolutionDetail.h.

 {
     ARM_COMPUTE_ERROR_ON(stridex > 3);
     float32x4x2_t out =
     {
         {
             single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
             single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
         }
     };
 
     if(stridex == 2)
     {
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
     }
     else if(stridex == 3)
     {
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
     }
 
     return out;
 }

References ARM_COMPUTE_ERROR_ON, and single_convolve_3x3_dilation().

◆ convolve_3x3_dilation() [2/2]

int32x4x2_t arm_compute::detail::convolve_3x3_dilation	(	const T *	in_top,
		const T *	in_mid,
		const T *	in_low,
		const int32x4x3_t &	m0,
		const int32x4x3_t &	m1,
		const int32x4x3_t &	m2,
		const size_t	dilation_x,
		unsigned int	stridex,
		int	input_offset
	)

inline

Perform a 3x3 convolution for 8 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	dilation_x	Dilation, in elements across x.
[in]	stridex	Stride value in elements across x.
[in]	input_offset	Input quantization offset.

Definition at line 554 of file NEDirectConvolutionDetail.h.

 {
     ARM_COMPUTE_ERROR_ON(stridex > 3);
     int32x4x2_t out =
     {
         {
             single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
             single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
         }
     };
 
     if(stridex == 2)
     {
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
     }
     else if(stridex == 3)
     {
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
     }
     return out;
 }

References ARM_COMPUTE_ERROR_ON, single_convolve_3x3_dilation(), arm_compute::wrapper::vgetlane(), and arm_compute::wrapper::vsetlane().

◆ for_each_error() [1/2]

arm_compute::Status arm_compute::detail::for_each_error ( F && )

inline

Definition at line 104 of file Validate.h.

 {
     return arm_compute::Status{};
 }

Referenced by arm_compute::error_on_mismatching_dimensions(), and for_each_error().

◆ for_each_error() [2/2]

arm_compute::Status arm_compute::detail::for_each_error	(	F &&	func,
		T &&	arg,
		Ts &&...	args
	)

inline

Definition at line 110 of file Validate.h.

 {
     ARM_COMPUTE_RETURN_ON_ERROR(func(arg));
     ARM_COMPUTE_RETURN_ON_ERROR(for_each_error(func, args...));
     return arm_compute::Status{};
 }

References GemmTuner::args, ARM_COMPUTE_RETURN_ON_ERROR, for_each_error(), and func.

◆ get_input_num_elems_processed() [1/2]

int arm_compute::detail::get_input_num_elems_processed ( unsigned int num_elems_written_per_iteration )

◆ get_input_num_elems_processed() [2/2]

int arm_compute::detail::get_input_num_elems_processed	(	unsigned int	num_elems_written_per_iteration,
		unsigned int	stridex
	)

inline

Get the number of elements processed on 3x3 convolution.

Parameters

[in]	num_elems_written_per_iteration	Number of elements written per iteration on 3x3 convolution.
[in]	stridex	Stride value in elements across x.

Returns: The number of elements processed.

Definition at line 947 of file NEDirectConvolutionDetail.h.

 {
     switch(stridex)
     {
         case 1:
             return num_elems_written_per_iteration;
         case 2:
             return num_elems_written_per_iteration << 1;
         case 3:
             return num_elems_written_per_iteration * 3;
         default:
             ARM_COMPUTE_ERROR("stridex not supported");
             return 0;
     }
 }

References ARM_COMPUTE_ERROR.

◆ get_input_num_elems_processed< 1 >()

int arm_compute::detail::get_input_num_elems_processed< 1 > ( unsigned int num_elems_written_per_iteration )

Definition at line 152 of file NEDirectConvolution3x3.h.

 {
     return num_elems_written_per_iteration;
 }

◆ get_input_num_elems_processed< 2 >()

int arm_compute::detail::get_input_num_elems_processed< 2 > ( unsigned int num_elems_written_per_iteration )

Definition at line 158 of file NEDirectConvolution3x3.h.

 {
     return num_elems_written_per_iteration << 1;
 }

◆ get_input_num_elems_processed< 3 >()

int arm_compute::detail::get_input_num_elems_processed< 3 > ( unsigned int num_elems_written_per_iteration )

Definition at line 164 of file NEDirectConvolution3x3.h.

 {
     return num_elems_written_per_iteration * 3;
 }

◆ have_different_dimensions()

bool arm_compute::detail::have_different_dimensions	(	const Dimensions< T > &	dim1,
		const Dimensions< T > &	dim2,
		unsigned int	upper_dim
	)

inline

Definition at line 47 of file Validate.h.

 {
     for(unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
     {
         if(dim1[i] != dim2[i])
         {
             return true;
         }
     }
 
     return false;
 }

Referenced by arm_compute::error_on_mismatching_shapes(), arm_compute::error_on_tensors_not_even(), arm_compute::error_on_tensors_not_subsampled(), compare_dimension< T >::operator()(), and NELogicalKernel::validate().

◆ load_matrix_row() [1/3]

float32x4x3_t arm_compute::detail::load_matrix_row ( const float * ptr )

inline

Definition at line 34 of file NEDirectConvolution3x3.h.

 {
     const float32x4x3_t r =
     {
         {
             vld1q_dup_f32(ptr),
             vld1q_dup_f32(1 + ptr),
             vld1q_dup_f32(2 + ptr)
         }
     };
     return r;
 }

◆ load_matrix_row() [2/3]

float32x4x3_t arm_compute::detail::load_matrix_row	(	const float *	ptr,
		int	weights_offset = `0`
	)

inline

Loads one row of a 3x3 matrix (float): each of the three row elements is duplicated across all lanes of its own vector.

Parameters

[in]	ptr	Pointer to the first element of a row of a float 3x3 matrix.
[in]	weights_offset	(Optional) Weights quantization offset. Unused by this float overload.

Returns: The loaded matrix.

Definition at line 45 of file NEDirectConvolutionDetail.h.

 {
     ARM_COMPUTE_UNUSED(weights_offset);
     const float32x4x3_t r =
     {
         {
             vld1q_dup_f32(ptr),
             vld1q_dup_f32(1 + ptr),
             vld1q_dup_f32(2 + ptr)
         }
     };
     return r;
 }

References ARM_COMPUTE_UNUSED.

◆ load_matrix_row() [3/3]

int32x4x3_t arm_compute::detail::load_matrix_row	(	const T *	ptr,
		int	weights_offset = `0`
	)

inline

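Loads one row of a 3x3 matrix (uint8_t/int8_t): each of the three row elements has the weights offset added and is duplicated, as int32, across all lanes of its own vector.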

Parameters

[in]	ptr	Pointer to the first element of a row of a uint8_t/int8_t 3x3 matrix.
[in]	weights_offset	(Optional) Weights quantization offset, added to every loaded element.

Returns: The loaded matrix.

Definition at line 67 of file NEDirectConvolutionDetail.h.

 {
     const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
 
     /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
        r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
     int32x4x3_t r =
     {
         {
             vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
             vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
             vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
         }
     };
     return r;
 }

◆ single_convolve_3x3_dilation() [1/2]

float32x4_t arm_compute::detail::single_convolve_3x3_dilation	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2,
		const size_t	dilation_x,
		int	input_offset
	)

inline

Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	dilation_x	Dilation, in elements across x.
[in]	input_offset	Input quantization offset. Unused by this float32 overload.

Definition at line 248 of file NEDirectConvolutionDetail.h.

 {
     ARM_COMPUTE_UNUSED(input_offset);
 
     const float32x4x3_t vtop =
     {
         {
             vld1q_f32(in_top),
             vld1q_f32(in_top + dilation_x),
             vld1q_f32(in_top + 2 * dilation_x)
         }
     };
     const float32x4x3_t vmid =
     {
         {
             vld1q_f32(in_mid),
             vld1q_f32(in_mid + dilation_x),
             vld1q_f32(in_mid + 2 * dilation_x)
         }
     };
     const float32x4x3_t vlow =
     {
         {
             vld1q_f32(in_low),
             vld1q_f32(in_low + dilation_x),
             vld1q_f32(in_low + 2 * dilation_x)
         }
     };
     float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
     out             = vmlaq_f32(out, vtop.val[1], m0.val[1]);
     out             = vmlaq_f32(out, vtop.val[2], m0.val[2]);
 
     out = vmlaq_f32(out, vmid.val[0], m1.val[0]);
     out = vmlaq_f32(out, vmid.val[1], m1.val[1]);
     out = vmlaq_f32(out, vmid.val[2], m1.val[2]);
 
     out = vmlaq_f32(out, vlow.val[0], m2.val[0]);
     out = vmlaq_f32(out, vlow.val[1], m2.val[1]);
     out = vmlaq_f32(out, vlow.val[2], m2.val[2]);
 
     return out;
 }

References ARM_COMPUTE_UNUSED.

Referenced by convolve_3x3_dilation().

◆ single_convolve_3x3_dilation() [2/2]

int32x4_t arm_compute::detail::single_convolve_3x3_dilation	(	const T *	in_top,
		const T *	in_mid,
		const T *	in_low,
		const int32x4x3_t &	m0,
		const int32x4x3_t &	m1,
		const int32x4x3_t &	m2,
		size_t	dilation_x,
		int32_t	input_offset
	)

inline

Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	dilation_x	Dilation, in elements across x.
[in]	input_offset	Input quantization offset.

Definition at line 466 of file NEDirectConvolutionDetail.h.

 {
     // Pick the 8-lane triple-vector type matching T: unsigned for uint8_t, signed otherwise.
     using VectorType    = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
     // Tag used to request a 128-bit int32 vector from the wrapper helpers.
     using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
 
     // Input offset replicated into an int32x4_t so it can be added to every widened input.
     const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
 
     // Three taps per input row, dilation_x elements apart (top row).
     const VectorType vtop =
     {
         {
             wrapper::vload(in_top),
             wrapper::vload(in_top + dilation_x),
             wrapper::vload(in_top + 2 * dilation_x)
         }
     };
     // Same three taps for the middle row.
     const VectorType vmid =
     {
         {
             wrapper::vload(in_mid),
             wrapper::vload(in_mid + dilation_x),
             wrapper::vload(in_mid + 2 * dilation_x)
         }
     };
     // Same three taps for the bottom row.
     const VectorType vlow =
     {
         {
             wrapper::vload(in_low),
             wrapper::vload(in_low + dilation_x),
             wrapper::vload(in_low + 2 * dilation_x)
         }
     };
 
     // Convert each tap to int32x4_t with the input offset added: the 8-bit lanes are widened
     // (vmovl), the low half is kept (vgetlow) and accumulated onto the offset (vaddw).
     // NOTE(review): presumably only the first four of the eight loaded elements are used — confirm against wrapper semantics.
     const int32x4x3_t vtop_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
         }
     };
     const int32x4x3_t vmid_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
         }
     };
     const int32x4x3_t vlow_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
         }
     };
 
     // Multiply-accumulate the nine taps against the filter rows: top row with m0 ...
     int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
     out           = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
     out           = wrapper::vmla(out, vtop_s32.val[2], m0.val[2]);
 
     // ... middle row with m1 ...
     out = wrapper::vmla(out, vmid_s32.val[0], m1.val[0]);
     out = wrapper::vmla(out, vmid_s32.val[1], m1.val[1]);
     out = wrapper::vmla(out, vmid_s32.val[2], m1.val[2]);
 
     // ... bottom row with m2.
     out = wrapper::vmla(out, vlow_s32.val[0], m2.val[0]);
     out = wrapper::vmla(out, vlow_s32.val[1], m2.val[1]);
     out = wrapper::vmla(out, vlow_s32.val[2], m2.val[2]);
 
     return out;
 }

References type, arm_compute::wrapper::vaddw(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vgetlow(), arm_compute::wrapper::vload(), arm_compute::wrapper::vmla(), arm_compute::wrapper::vmovl(), arm_compute::wrapper::vmul(), and arm_compute::wrapper::vreinterpret().

◆ store_results() [1/2]

void arm_compute::detail::store_results	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

Stores an int32x4x2_t array into a memory location.

Parameters

[in]	buffer	Pointer to the memory location where the values will be stored.
[in]	values	Values that will be stored.

◆ store_results() [2/2]

void store_results	(	float *	buffer,
		const float32x4x2_t &	values
	)

Stores a float32x4x2_t array into a memory location.

Parameters

[in]	buffer	Pointer to the memory location where the values will be stored.
[in]	values	Values that will be stored.

◆ store_results< 1 >() [1/2]

void arm_compute::detail::store_results< 1 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 122 of file NEDirectConvolutionDetail.h.

 {
     // <1> specialisation: both vectors of the pair are written back to back,
     // i.e. eight consecutive int32 values starting at buffer.
     vst1q_s32(buffer, values.val[0]);
     vst1q_s32(buffer + 4, values.val[1]);
 }

◆ store_results< 1 >() [2/2]

void store_results< 1 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 130 of file NEDirectConvolution3x3.h.

 {
     // <1> specialisation: both vectors of the pair are written back to back,
     // i.e. eight consecutive floats starting at buffer.
     vst1q_f32(buffer, values.val[0]);
     vst1q_f32(buffer + 4, values.val[1]);
 }

Referenced by convolve_3x3().

◆ store_results< 2 >() [1/2]

void arm_compute::detail::store_results< 2 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 129 of file NEDirectConvolutionDetail.h.

 {
     // <2> specialisation: only the first vector (four int32 values) is written;
     // values.val[1] is not stored.
     vst1q_s32(buffer, values.val[0]);
 }

◆ store_results< 2 >() [2/2]

void store_results< 2 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 137 of file NEDirectConvolution3x3.h.

 {
     // <2> specialisation: only the first vector (four floats) is written;
     // values.val[1] is not stored.
     vst1q_f32(buffer, values.val[0]);
 }

Referenced by convolve_3x3().

◆ store_results< 3 >() [1/2]

void arm_compute::detail::store_results< 3 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 135 of file NEDirectConvolutionDetail.h.

 {
     // <3> specialisation: only the low half of the first vector (two int32 values) is written.
     vst1_s32(buffer, vget_low_s32(values.val[0]));
 }

◆ store_results< 3 >() [2/2]

void store_results< 3 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 143 of file NEDirectConvolution3x3.h.

 {
     // <3> specialisation: only the low half of the first vector (two floats) is written.
     vst1_f32(buffer, vget_low_f32(values.val[0]));
 }

Referenced by convolve_3x3().

◆ validate_internal_context()

StatusCode arm_compute::detail::validate_internal_context ( const IContext * ctx )

inline

Check if an internal context is valid.

Parameters

[in] ctx Internal context to check

Returns: A status code

Definition at line 143 of file IContext.h.

 {
     if(ctx == nullptr || !ctx->is_valid())
     {
         ARM_COMPUTE_LOG_ERROR_ACL("Invalid context object");
         return StatusCode::InvalidArgument;
     }
     return StatusCode::Success;
 }

References ARM_COMPUTE_LOG_ERROR_ACL, arm_compute::InvalidArgument, IContext::is_valid(), and arm_compute::Success.

Referenced by AclCreateQueue(), AclCreateTensor(), AclCreateTensorPack(), AclDestroyContext(), AclGetClContext(), AclGetClDevice(), and AclSetClContext().

◆ validate_internal_pack()

StatusCode arm_compute::detail::validate_internal_pack ( const TensorPack * pack )

inline

Check if an internal TensorPack is valid.

Parameters

[in] pack Internal tensor pack to check

Returns: A status code

Definition at line 119 of file TensorPack.h.

 {
     if(pack == nullptr || !pack->is_valid())
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[TensorPack]: Invalid tensor pack object");
         return StatusCode::InvalidArgument;
     }
     return StatusCode::Success;
 }

References ARM_COMPUTE_LOG_ERROR_ACL, arm_compute::InvalidArgument, TensorPack::is_valid(), and arm_compute::Success.

Referenced by AclDestroyTensorPack(), AclPackTensor(), and AclPackTensors().

◆ validate_internal_queue()

StatusCode arm_compute::detail::validate_internal_queue ( const IQueue * queue )

inline

Check if an internal queue is valid.

Parameters

[in] queue Internal queue to check

Returns: A status code

Definition at line 89 of file IQueue.h.

 {
     if(queue == nullptr || !queue->is_valid())
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[IQueue]: Invalid queue object");
         return StatusCode::InvalidArgument;
     }
     return StatusCode::Success;
 }

References ARM_COMPUTE_LOG_ERROR_ACL, arm_compute::InvalidArgument, IQueue::is_valid(), and arm_compute::Success.

Referenced by AclDestroyQueue(), AclGetClQueue(), AclQueueFinish(), and AclSetClQueue().

◆ validate_internal_tensor()

StatusCode arm_compute::detail::validate_internal_tensor ( const ITensorV2 * tensor )

inline

Check if an internal tensor is valid.

Parameters

[in] tensor Internal tensor to check

Returns: A status code

Definition at line 129 of file ITensorV2.h.

 {
     if(tensor == nullptr || !tensor->is_valid())
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[ITensorV2]: Invalid tensor object");
         return StatusCode::InvalidArgument;
     }
     return StatusCode::Success;
 }

References ARM_COMPUTE_LOG_ERROR_ACL, arm_compute::InvalidArgument, ITensorV2::is_valid(), and arm_compute::Success.

Referenced by AclDestroyTensor(), AclGetClMem(), AclMapTensor(), AclTensorImport(), and AclUnmapTensor().

Data Structures

Enumerations

Functions

Enumeration Type Documentation

◆ ObjectType

Function Documentation

◆ accumulate_results() [1/2]

◆ accumulate_results() [2/2]

◆ accumulate_results< 1 >() [1/2]

◆ accumulate_results< 1 >() [2/2]

◆ accumulate_results< 2 >() [1/2]

◆ accumulate_results< 2 >() [2/2]

◆ accumulate_results< 3 >() [1/2]

◆ accumulate_results< 3 >() [2/2]

◆ convert_to_descriptor()

◆ convert_to_legacy_tensor_info()

◆ convolve_3x3() [1/3]

◆ convolve_3x3() [2/3]

◆ convolve_3x3() [3/3]

◆ convolve_3x3< 1 >()

◆ convolve_3x3< 2 >()

◆ convolve_3x3< 3 >()

◆ convolve_3x3_dilation() [1/2]

◆ convolve_3x3_dilation() [2/2]

◆ for_each_error() [1/2]

◆ for_each_error() [2/2]

◆ get_input_num_elems_processed() [1/2]

◆ get_input_num_elems_processed() [2/2]

◆ get_input_num_elems_processed< 1 >()

◆ get_input_num_elems_processed< 2 >()

◆ get_input_num_elems_processed< 3 >()

◆ have_different_dimensions()

◆ load_matrix_row() [1/3]

◆ load_matrix_row() [2/3]

◆ load_matrix_row() [3/3]

◆ single_convolve_3x3_dilation() [1/2]

◆ single_convolve_3x3_dilation() [2/2]

◆ store_results() [1/2]

◆ store_results() [2/2]

◆ store_results< 1 >() [1/2]

◆ store_results< 1 >() [2/2]

◆ store_results< 2 >() [1/2]

◆ store_results< 2 >() [2/2]

◆ store_results< 3 >() [1/2]

◆ store_results< 3 >() [2/2]

◆ validate_internal_context()

◆ validate_internal_pack()

◆ validate_internal_queue()

◆ validate_internal_tensor()