Data Structures
struct	brelu
	Bounded RELU activation object. More...

class	compare_dimension
	Function to compare two Dimensions objects and throw an error on mismatch. More...

struct	dummy
	Dummy activation object. More...

struct	get_tensor_info_t
	Get the info for a tensor, dummy struct. More...

struct	get_tensor_info_t< ITensorInfo * >
	Get the info for a tensor. More...

struct	linear
	Linear activation object. More...

struct	logistic
	Logistic activation object. More...

struct	lubrelu
	Lower-Upper Bounded RELU activation object. More...

struct	relu
	RELU activation object. More...

struct	square
	Square activation object. More...

Functions
template<typename T >
bool	have_different_dimensions (const Dimensions< T > &dim1, const Dimensions< T > &dim2, unsigned int upper_dim)

template<typename F >
arm_compute::Status	for_each_error (F &&)

template<typename F , typename T , typename... Ts>
arm_compute::Status	for_each_error (F &&func, T &&arg, Ts &&... args)

float32x4x3_t	load_matrix_row (const float *ptr)

template<unsigned int stridex>
float32x4x2_t	convolve_3x3 (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)

template<>
float32x4x2_t	convolve_3x3< 1 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)

template<>
float32x4x2_t	convolve_3x3< 2 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)

template<>
float32x4x2_t	convolve_3x3< 3 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)

template<unsigned int stridex>
void	store_results (float *buffer, const float32x4x2_t &values)
	Stores a float32x4x2_t array into a memory location. More...

template<>
void	store_results< 1 > (float *buffer, const float32x4x2_t &values)

template<>
void	store_results< 2 > (float *buffer, const float32x4x2_t &values)

template<>
void	store_results< 3 > (float *buffer, const float32x4x2_t &values)

template<unsigned int stridex>
int	get_input_num_elems_processed (unsigned int num_elems_written_per_iteration)

template<>
int	get_input_num_elems_processed< 1 > (unsigned int num_elems_written_per_iteration)

template<>
int	get_input_num_elems_processed< 2 > (unsigned int num_elems_written_per_iteration)

template<>
int	get_input_num_elems_processed< 3 > (unsigned int num_elems_written_per_iteration)

float32x4x3_t	load_matrix_row (const float *ptr, int weights_offset=0)
	Loads a 3x3 matrix as a row (float). More...

template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x3_t	load_matrix_row (const T *ptr, int weights_offset=0)
	Loads a 3x3 matrix as a row (uint8_t/int8_t). More...

template<unsigned int stridex>
void	store_results (int32_t *buffer, const int32x4x2_t &values)
	Stores an int32x4x2_t array into a memory location. More...

template<>
void	store_results< 1 > (int32_t *buffer, const int32x4x2_t &values)

template<>
void	store_results< 2 > (int32_t *buffer, const int32x4x2_t &values)

template<>
void	store_results< 3 > (int32_t *buffer, const int32x4x2_t &values)

template<unsigned int stridex>
void	accumulate_results (float *buffer, const float32x4x2_t &values)

template<>
void	accumulate_results< 1 > (float *buffer, const float32x4x2_t &values)

template<>
void	accumulate_results< 2 > (float *buffer, const float32x4x2_t &values)

template<>
void	accumulate_results< 3 > (float *buffer, const float32x4x2_t &values)

template<unsigned int stridex>
void	accumulate_results (int32_t *buffer, const int32x4x2_t &values)

template<>
void	accumulate_results< 1 > (int32_t *buffer, const int32x4x2_t &values)

template<>
void	accumulate_results< 2 > (int32_t *buffer, const int32x4x2_t &values)

template<>
void	accumulate_results< 3 > (int32_t *buffer, const int32x4x2_t &values)

float32x4_t	single_convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, int input_offset)
	Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. More...

float32x4x2_t	convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset=0)
	Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. More...

template<bool accumulate>
void	convolve_3x3 (const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, unsigned int stridex, int input_offset=0)
	Perform a convolve3x3 on float32. More...

template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4_t	single_convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, size_t dilation_x, int32_t input_offset)
	Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. More...

template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x2_t	convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset)
	Perform a 3x3 convolution for 8 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. More...

template<bool accumulate, typename T1 , typename T2 , ARM_COMPUTE_REQUIRES_TA(std::is_same< T1, uint8_t >::value||std::is_same< T1, int8_t >::value) >
void	convolve_3x3 (const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, unsigned int stridex, int32_t input_offset)
	Perform a convolve3x3 on 8-bit elements. More...

int	get_input_num_elems_processed (unsigned int num_elems_written_per_iteration, unsigned int stridex)
	Get the number of elements processed on 3x3 convolution. More...

Function Documentation

◆ accumulate_results() [1/2]

void arm_compute::detail::accumulate_results	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Referenced by accumulate_results< 3 >(), and store_results< 3 >().

◆ accumulate_results() [2/2]

void arm_compute::detail::accumulate_results	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

◆ accumulate_results< 1 >() [1/2]

void arm_compute::detail::accumulate_results< 1 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 145 of file NEDirectConvolutionDetail.h.

Referenced by accumulate_results< 3 >(), and convolve_3x3().

 {
     // Stride-1 accumulate: add values.val[0] to the four floats at buffer and
     // write the sums back in place.
     vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
     // Same for values.val[1] and the following four floats (buffer + 4).
     vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
 }

◆ accumulate_results< 1 >() [2/2]

void arm_compute::detail::accumulate_results< 1 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 167 of file NEDirectConvolutionDetail.h.

 {
     // Stride-1 accumulate (int32): add values.val[0] to the four int32 values
     // at buffer and write the sums back in place.
     vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
     // Same for values.val[1] and the following four int32 values (buffer + 4).
     vst1q_s32(buffer + 4, vaddq_s32(vld1q_s32(buffer + 4), values.val[1]));
 }

◆ accumulate_results< 2 >() [1/2]

void arm_compute::detail::accumulate_results< 2 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 152 of file NEDirectConvolutionDetail.h.

Referenced by accumulate_results< 3 >(), and convolve_3x3().

 {
     // Stride-2 accumulate: only values.val[0] is added to the four floats at
     // buffer; values.val[1] is not read here.
     vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
 }

◆ accumulate_results< 2 >() [2/2]

void arm_compute::detail::accumulate_results< 2 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 174 of file NEDirectConvolutionDetail.h.

 {
     // Stride-2 accumulate (int32): only values.val[0] is added to the four
     // int32 values at buffer; values.val[1] is not read here.
     vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
 }

◆ accumulate_results< 3 >() [1/2]

void arm_compute::detail::accumulate_results< 3 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 158 of file NEDirectConvolutionDetail.h.

References accumulate_results().

Referenced by accumulate_results< 3 >(), and convolve_3x3().

 {
     // Stride-3 accumulate: only the two low lanes of values.val[0] are added
     // to the two floats at buffer (64-bit load/add/store).
     vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
 }

◆ accumulate_results< 3 >() [2/2]

void arm_compute::detail::accumulate_results< 3 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 180 of file NEDirectConvolutionDetail.h.

References accumulate_results(), accumulate_results< 1 >(), accumulate_results< 2 >(), accumulate_results< 3 >(), store_results(), store_results< 1 >(), store_results< 2 >(), store_results< 3 >(), vadd_f16(), and vaddq_f16().

 {
     // Stride-3 accumulate (int32): only the two low lanes of values.val[0] are
     // added to the two int32 values at buffer (64-bit load/add/store).
     vst1_s32(buffer, vadd_s32(vld1_s32(buffer), vget_low_s32(values.val[0])));
 }

◆ convolve_3x3() [1/3]

float32x4x2_t arm_compute::detail::convolve_3x3	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2
	)

Referenced by convolve_3x3_dilation(), and load_matrix_row().

◆ convolve_3x3() [2/3]

void convolve_3x3	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		float *	out_ptr,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2,
		unsigned int	stridex,
		int	input_offset = `0`
	)

inline

Perform a convolve3x3 on float32.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[out]	out_ptr	Pointer to the output.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	stridex	Stride value in elements across x.
[in]	input_offset	(Optional) Input quantization offset.

Definition at line 353 of file NEDirectConvolutionDetail.h.

References accumulate(), accumulate_results< 1 >(), accumulate_results< 2 >(), accumulate_results< 3 >(), ARM_COMPUTE_ERROR_ON, ARM_COMPUTE_UNUSED, store_results< 1 >(), store_results< 2 >(), and store_results< 3 >().

 {
     // Float 3x3 convolution over three input rows. The result is either added
     // to or written over out_ptr depending on the `accumulate` template flag.
     // input_offset is not used on the float path.
     ARM_COMPUTE_UNUSED(input_offset);
     ARM_COMPUTE_ERROR_ON(stridex > 3);
 
     // Both output vectors start at zero; the stride-2 branch only fills val[0].
     float32x4x2_t out =
     {
         {
             vdupq_n_f32(0.f),
             vdupq_n_f32(0.f)
         }
     };
     if(stridex == 2)
     {
         // De-interleaving loads: val[0] holds the even-indexed inputs
         // (0,2,4,6) and val[1] the odd-indexed ones (1,3,5,7) of each row.
         const float32x4x2_t vtop     = vld2q_f32(in_top);
         const float32x4x2_t vmid     = vld2q_f32(in_mid);
         const float32x4x2_t vlow     = vld2q_f32(in_low);
         // Elements 8..11 of each row; only element 8 is needed, to build the
         // shifted even vector (2,4,6,8) below.
         const float32x4_t   vtop_end = vld1q_f32(in_top + 8);
         const float32x4_t   vmid_end = vld1q_f32(in_mid + 8);
         const float32x4_t   vlow_end = vld1q_f32(in_low + 8);
 
         // Output i = sum over rows of in[2i]*w0 + in[2i+1]*w1 + in[2i+2]*w2.
         out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
 
         out.val[0] = vmlaq_f32(out.val[0], vtop.val[1], m0.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop_end, 1), m0.val[2]);
 
         out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
         out.val[0] = vmlaq_f32(out.val[0], vmid.val[1], m1.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid_end, 1), m1.val[2]);
 
         out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
         out.val[0] = vmlaq_f32(out.val[0], vlow.val[1], m2.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow_end, 1), m2.val[2]);
 
         accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
     }
     else
     {
         // Stride 1 (also the basis for stride 3): load 12 consecutive floats
         // from each row as three 4-lane vectors.
         const float32x4x3_t vtop =
         {
             {
                 vld1q_f32(in_top),
                 vld1q_f32(in_top + 4),
                 vld1q_f32(in_top + 8)
             }
         };
         const float32x4x3_t vmid =
         {
             {
                 vld1q_f32(in_mid),
                 vld1q_f32(in_mid + 4),
                 vld1q_f32(in_mid + 8)
             }
         };
         const float32x4x3_t vlow =
         {
             {
                 vld1q_f32(in_low),
                 vld1q_f32(in_low + 4),
                 vld1q_f32(in_low + 8)
             }
         };
         // First tap of the top row for both output vectors.
         out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
         out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
 
         // Outputs 0..3: vextq_f32(a, b, k) yields the input window shifted by
         // k elements, pairing with the k-th filter tap of each row.
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
 
         out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
 
         out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
 
         // Outputs 4..7: same pattern one vector further along each row.
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
 
         out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
 
         out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
 
         if(stridex == 3)
         {
             // Move stride-1 output 3 into lane 1 so lanes 0 and 1 hold the two
             // stride-3 outputs (stride-1 outputs 0 and 3).
             out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
             accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
         }
         else
         {
             // Any stridex other than 2 or 3 (including 0) is handled as stride 1.
             accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
         }
     }
 }

◆ convolve_3x3() [3/3]

void arm_compute::detail::convolve_3x3	(	const T1 *	in_top,
		const T1 *	in_mid,
		const T1 *	in_low,
		T2 *	out_ptr,
		const int32x4x3_t &	m0,
		const int32x4x3_t &	m1,
		const int32x4x3_t &	m2,
		unsigned int	stridex,
		int32_t	input_offset
	)

Perform a convolve3x3 on 8-bit elements.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[out]	out_ptr	Pointer to the output.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	stridex	Stride value in elements across x.
[in]	input_offset	Input quantization offset.

Definition at line 594 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR_ON.

 {
     // 3x3 convolution on 8-bit inputs with 32-bit accumulation. The result is
     // added to or written over out_ptr depending on the `accumulate` flag.
     ARM_COMPUTE_ERROR_ON(stridex > 3);
     // Pick the unsigned or signed pair-of-8-lane vector type matching T1.
     using VectorType    = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
     using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
 
     // input_offset broadcast to a 4-lane int32 vector.
     const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
 
     // Two vectors per row, starting at element 0 and element 8.
     const VectorType vtop =
     {
         {
             wrapper::vload(in_top),
             wrapper::vload(in_top + 8)
         }
     };
     const VectorType vmid =
     {
         {
             wrapper::vload(in_mid),
             wrapper::vload(in_mid + 8)
         }
     };
     const VectorType vlow =
     {
         {
             wrapper::vload(in_low),
             wrapper::vload(in_low + 8)
         }
     };
 
     // Widen the inputs and add the input offset. Three int32x4 vectors are
     // built per row from the low/high halves of val[0] and the low half of
     // val[1]; the high half of val[1] is not used.
     // NOTE(review): presumably this covers input elements 0..11 - confirm
     // against the wrapper::vmovl / wrapper::vaddw definitions.
     const int32x4x3_t vtop_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
         }
     };
     const int32x4x3_t vmid_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
         }
     };
     const int32x4x3_t vlow_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
         }
     };
 
     // Zero-initialised accumulators for the two output vectors.
     int32x4x2_t out
     {
         {
             wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
             wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
         }
     };
 
     // 0: first output vector - three taps per row, using windows shifted by
     // 0, 1 and 2 (vext_1 / vext_2) across vectors 0 and 1 of each row.
     out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vtop_s32.val[0], vtop_s32.val[1]), m0.val[1]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vtop_s32.val[0], vtop_s32.val[1]), m0.val[2]);
 
     out.val[0] = wrapper::vmla(out.val[0], vmid_s32.val[0], m1.val[0]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vmid_s32.val[0], vmid_s32.val[1]), m1.val[1]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vmid_s32.val[0], vmid_s32.val[1]), m1.val[2]);
 
     out.val[0] = wrapper::vmla(out.val[0], vlow_s32.val[0], m2.val[0]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vlow_s32.val[0], vlow_s32.val[1]), m2.val[1]);
     out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vlow_s32.val[0], vlow_s32.val[1]), m2.val[2]);
 
     // 1: second output vector - same pattern across vectors 1 and 2 of each row.
     out.val[1] = wrapper::vmla(out.val[1], vtop_s32.val[1], m0.val[0]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vtop_s32.val[1], vtop_s32.val[2]), m0.val[1]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vtop_s32.val[1], vtop_s32.val[2]), m0.val[2]);
 
     out.val[1] = wrapper::vmla(out.val[1], vmid_s32.val[1], m1.val[0]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vmid_s32.val[1], vmid_s32.val[2]), m1.val[1]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vmid_s32.val[1], vmid_s32.val[2]), m1.val[2]);
 
     out.val[1] = wrapper::vmla(out.val[1], vlow_s32.val[1], m2.val[0]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
 
     // Select the outputs matching the stride. A stridex of 0 matches no branch,
     // so nothing is stored or accumulated in that case.
     if(stridex == 1)
     {
         accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
     }
     else if(stridex == 2)
     {
         // Compact stride-1 outputs 0, 2, 4 and 6 into the four lanes of val[0].
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
 
         accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
     }
     else if(stridex == 3)
     {
         // Move stride-1 output 3 into lane 1 so lanes 0 and 1 hold outputs 0 and 3.
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
         accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
     }
 }

◆ convolve_3x3< 1 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 1 >	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2
	)

inline

Definition at line 51 of file NEDirectConvolution3x3.h.

Referenced by convolve_3x3< 2 >(), and convolve_3x3< 3 >().

 {
     // Stride-1 float 3x3 convolution: returns eight outputs as two 4-lane
     // vectors. Each row contributes 12 consecutive floats.
     const float32x4x3_t vtop =
     {
         {
             vld1q_f32(in_top),
             vld1q_f32(in_top + 4),
             vld1q_f32(in_top + 8)
         }
     };
     const float32x4x3_t vmid =
     {
         {
             vld1q_f32(in_mid),
             vld1q_f32(in_mid + 4),
             vld1q_f32(in_mid + 8)
         }
     };
     const float32x4x3_t vlow =
     {
         {
             vld1q_f32(in_low),
             vld1q_f32(in_low + 4),
             vld1q_f32(in_low + 8)
         }
     };
     // Start both accumulators with the first tap of the top row.
     float32x4x2_t out =
     {
         {
             vmulq_f32(vtop.val[0], m0.val[0]),
             vmulq_f32(vtop.val[1], m0.val[0])
         }
     };
     // Outputs 0..3: vextq_f32(a, b, k) yields the input window shifted by k
     // elements, pairing with the k-th filter tap of each row.
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
 
     out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
 
     out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
 
     // Outputs 4..7: same pattern one vector further along each row.
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
 
     out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
 
     out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
     out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
     return out;
 }

◆ convolve_3x3< 2 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 2 >	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2
	)

inline

Definition at line 109 of file NEDirectConvolution3x3.h.

References convolve_3x3< 1 >().

 {
     // Stride 2: compute the stride-1 result, then compact stride-1 outputs
     // 0, 2, 4 and 6 into the four lanes of out.val[0]. out.val[1] is returned
     // unchanged from the stride-1 computation.
     float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
     return out;
 }

◆ convolve_3x3< 3 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 3 >	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2
	)

inline

Definition at line 119 of file NEDirectConvolution3x3.h.

References convolve_3x3< 1 >(), and store_results().

 {
     // Stride 3: compute the stride-1 result, then copy stride-1 output 3 into
     // lane 1 so lanes 0 and 1 of out.val[0] hold outputs 0 and 3.
     float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
     return out;
 }

◆ convolve_3x3_dilation() [1/2]

float32x4x2_t arm_compute::detail::convolve_3x3_dilation	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2,
		const size_t	dilation_x,
		unsigned int	stridex,
		int	input_offset = `0`
	)

inline

Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	dilation_x	Dilation, in elements across x.
[in]	stridex	Stride value in elements across x.
[in]	input_offset	(Optional) Input quantization offset.

Definition at line 307 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR_ON, convolve_3x3(), and single_convolve_3x3_dilation().

 {
     ARM_COMPUTE_ERROR_ON(stridex > 3);
     // Two 4-lane dilated convolutions: one starting at the row pointers and
     // one starting 4 elements further along each row.
     float32x4x2_t out =
     {
         {
             single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
             single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
         }
     };
 
     if(stridex == 2)
     {
         // Compact outputs 0, 2, 4 and 6 into the four lanes of out.val[0].
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
     }
     else if(stridex == 3)
     {
         // Copy output 3 into lane 1 so lanes 0 and 1 hold outputs 0 and 3.
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
     }
 
     // For any other stridex the two vectors are returned as computed.
     return out;
 }

◆ convolve_3x3_dilation() [2/2]

int32x4x2_t arm_compute::detail::convolve_3x3_dilation	(	const T *	in_top,
		const T *	in_mid,
		const T *	in_low,
		const int32x4x3_t &	m0,
		const int32x4x3_t &	m1,
		const int32x4x3_t &	m2,
		const size_t	dilation_x,
		unsigned int	stridex,
		int	input_offset
	)

inline

Perform a 3x3 convolution for 8 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	dilation_x	Dilation, in elements across x.
[in]	stridex	Stride value in elements across x.
[in]	input_offset	Input quantization offset.

Definition at line 555 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR_ON, single_convolve_3x3_dilation(), arm_compute::wrapper::vgetlane(), and arm_compute::wrapper::vsetlane().

 {
     ARM_COMPUTE_ERROR_ON(stridex > 3);
     // Two 4-lane dilated convolutions: one starting at the row pointers and
     // one starting 4 elements further along each row.
     int32x4x2_t out =
     {
         {
             single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
             single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
         }
     };
 
     if(stridex == 2)
     {
         // Compact outputs 0, 2, 4 and 6 into the four lanes of out.val[0].
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
     }
     else if(stridex == 3)
     {
         // Copy output 3 into lane 1 so lanes 0 and 1 hold outputs 0 and 3.
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
     }
     // For any other stridex the two vectors are returned as computed.
     return out;
 }

◆ for_each_error() [1/2]

arm_compute::Status arm_compute::detail::for_each_error ( F && )

inline

Definition at line 108 of file Validate.h.

Referenced by arm_compute::error_on_mismatching_dimensions(), and for_each_error().

 {
     // Recursion terminator: no arguments left to check, so return a
     // default-constructed Status (presumably the "no error" state - confirm
     // against the Status class).
     return arm_compute::Status{};
 }

◆ for_each_error() [2/2]

arm_compute::Status arm_compute::detail::for_each_error	(	F &&	func,
		T &&	arg,
		Ts &&...	args
	)

inline

Definition at line 114 of file Validate.h.

References GemmTuner::args, ARM_COMPUTE_RETURN_ON_ERROR, for_each_error(), and func.

 {
     // Apply func to the first argument, then recurse over the remaining ones.
     // NOTE(review): ARM_COMPUTE_RETURN_ON_ERROR presumably returns the failing
     // Status immediately, so later arguments are not visited after a failure -
     // confirm against the macro definition.
     ARM_COMPUTE_RETURN_ON_ERROR(func(arg));
     ARM_COMPUTE_RETURN_ON_ERROR(for_each_error(func, args...));
     return arm_compute::Status{};
 }

◆ get_input_num_elems_processed() [1/2]

int arm_compute::detail::get_input_num_elems_processed ( unsigned int num_elems_written_per_iteration )

Referenced by store_results< 3 >().

◆ get_input_num_elems_processed() [2/2]

int arm_compute::detail::get_input_num_elems_processed	(	unsigned int	num_elems_written_per_iteration,
		unsigned int	stridex
	)

inline

__ARM_FEATURE_FP16_VECTOR_ARITHMETIC

Get the number of elements processed on 3x3 convolution.

Parameters

[in]	num_elems_written_per_iteration	Number of elements written per iteration on 3x3 convolution.
[in]	stridex	Stride value in elements across x.

Returns: The number of elements processed.

Definition at line 948 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR.

 {
     // The result is num_elems_written_per_iteration multiplied by the stride
     // (1, 2 or 3); the unsigned value is converted to the int return type.
     switch(stridex)
     {
         case 1:
             return num_elems_written_per_iteration;
         case 2:
             // Shift left by one, i.e. multiply by 2.
             return num_elems_written_per_iteration << 1;
         case 3:
             return num_elems_written_per_iteration * 3;
         default:
             // Unsupported stride: report the error; 0 is returned if the error
             // macro returns control to this function.
             ARM_COMPUTE_ERROR("stridex not supported");
             return 0;
     }
 }

◆ get_input_num_elems_processed< 1 >()

int arm_compute::detail::get_input_num_elems_processed< 1 > ( unsigned int num_elems_written_per_iteration )

Definition at line 152 of file NEDirectConvolution3x3.h.

 {
     // Stride 1: the count is returned unchanged (converted to int).
     return num_elems_written_per_iteration;
 }

◆ get_input_num_elems_processed< 2 >()

int arm_compute::detail::get_input_num_elems_processed< 2 > ( unsigned int num_elems_written_per_iteration )

Definition at line 158 of file NEDirectConvolution3x3.h.

 {
     // Stride 2: twice the written count (left shift by one).
     return num_elems_written_per_iteration << 1;
 }

◆ get_input_num_elems_processed< 3 >()

int arm_compute::detail::get_input_num_elems_processed< 3 > ( unsigned int num_elems_written_per_iteration )

Definition at line 164 of file NEDirectConvolution3x3.h.

 {
     // Stride 3: three times the written count.
     return num_elems_written_per_iteration * 3;
 }

◆ have_different_dimensions()

bool arm_compute::detail::have_different_dimensions	(	const Dimensions< T > &	dim1,
		const Dimensions< T > &	dim2,
		unsigned int	upper_dim
	)

inline

Definition at line 51 of file Validate.h.

Referenced by CLPixelWiseMultiplicationKernel::border_size(), arm_compute::error_on_mismatching_shapes(), arm_compute::error_on_tensors_not_even(), arm_compute::error_on_tensors_not_subsampled(), compare_dimension< T >::operator()(), NEPixelWiseMultiplicationKernel::run_op(), and NELogicalKernel::validate().

 {
     for(unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
     {
         if(dim1[i] != dim2[i])
         {
             return true;
         }
     }
 
     return false;
 }

◆ load_matrix_row() [1/3]

float32x4x3_t arm_compute::detail::load_matrix_row ( const float * ptr )

inline

Definition at line 34 of file NEDirectConvolution3x3.h.

References convolve_3x3().

 {
     const float32x4x3_t r =
     {
         {
             vld1q_dup_f32(ptr),
             vld1q_dup_f32(1 + ptr),
             vld1q_dup_f32(2 + ptr)
         }
     };
     return r;
 }

◆ load_matrix_row() [2/3]

float32x4x3_t arm_compute::detail::load_matrix_row	(	const float *	ptr,
		int	weights_offset = `0`
	)

inline

Loads a 3x3 matrix as a row (float).

Parameters

[in]	ptr	Pointer to a float 3x3 matrix.
[in]	weights_offset	(Optional) Weights quantization offset.

Returns: The loaded matrix.

Definition at line 46 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_UNUSED.

 {
     ARM_COMPUTE_UNUSED(weights_offset);
     const float32x4x3_t r =
     {
         {
             vld1q_dup_f32(ptr),
             vld1q_dup_f32(1 + ptr),
             vld1q_dup_f32(2 + ptr)
         }
     };
     return r;
 }

◆ load_matrix_row() [3/3]

int32x4x3_t arm_compute::detail::load_matrix_row	(	const T *	ptr,
		int	weights_offset = `0`
	)

inline

Loads a 3x3 matrix as a row (uint8_t/int8_t).

Parameters

[in]	ptr	Pointer to a uint8_t/int8_t 3x3 matrix.
[in]	weights_offset	(Optional) Weights quantization offset.

Returns: The loaded matrix.

Definition at line 68 of file NEDirectConvolutionDetail.h.

References store_results(), store_results< 1 >(), store_results< 2 >(), and store_results< 3 >().

 {
     const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
 
     /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
        r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
     int32x4x3_t r =
     {
         {
             vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
             vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
             vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
         }
     };
     return r;
 }

◆ single_convolve_3x3_dilation() [1/2]

float32x4_t arm_compute::detail::single_convolve_3x3_dilation	(	const float *	in_top,
		const float *	in_mid,
		const float *	in_low,
		const float32x4x3_t &	m0,
		const float32x4x3_t &	m1,
		const float32x4x3_t &	m2,
		const size_t	dilation_x,
		int	input_offset
	)

inline

Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	dilation_x	Dilation, in elements across x.
[in]	input_offset	Input quantization offset. Unused by this float32 overload.

Definition at line 249 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_UNUSED.

Referenced by convolve_3x3_dilation().

 {
     ARM_COMPUTE_UNUSED(input_offset);
 
     const float32x4x3_t vtop =
     {
         {
             vld1q_f32(in_top),
             vld1q_f32(in_top + dilation_x),
             vld1q_f32(in_top + 2 * dilation_x)
         }
     };
     const float32x4x3_t vmid =
     {
         {
             vld1q_f32(in_mid),
             vld1q_f32(in_mid + dilation_x),
             vld1q_f32(in_mid + 2 * dilation_x)
         }
     };
     const float32x4x3_t vlow =
     {
         {
             vld1q_f32(in_low),
             vld1q_f32(in_low + dilation_x),
             vld1q_f32(in_low + 2 * dilation_x)
         }
     };
     float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
     out             = vmlaq_f32(out, vtop.val[1], m0.val[1]);
     out             = vmlaq_f32(out, vtop.val[2], m0.val[2]);
 
     out = vmlaq_f32(out, vmid.val[0], m1.val[0]);
     out = vmlaq_f32(out, vmid.val[1], m1.val[1]);
     out = vmlaq_f32(out, vmid.val[2], m1.val[2]);
 
     out = vmlaq_f32(out, vlow.val[0], m2.val[0]);
     out = vmlaq_f32(out, vlow.val[1], m2.val[1]);
     out = vmlaq_f32(out, vlow.val[2], m2.val[2]);
 
     return out;
 }

◆ single_convolve_3x3_dilation() [2/2]

int32x4_t arm_compute::detail::single_convolve_3x3_dilation	(	const T *	in_top,
		const T *	in_mid,
		const T *	in_low,
		const int32x4x3_t &	m0,
		const int32x4x3_t &	m1,
		const int32x4x3_t &	m2,
		size_t	dilation_x,
		int32_t	input_offset
	)

inline

Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.

Parameters

[in]	in_top	Pointer to the first row of the input.
[in]	in_mid	Pointer to the second row of the input.
[in]	in_low	Pointer to the third row of the input.
[in]	m0	First row of the filter.
[in]	m1	Second row of the filter.
[in]	m2	Third row of the filter.
[in]	dilation_x	Dilation, in elements across x.
[in]	input_offset	Input quantization offset.

Definition at line 467 of file NEDirectConvolutionDetail.h.

 {
     using VectorType    = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
     using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
 
     const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
 
     const VectorType vtop =
     {
         {
             wrapper::vload(in_top),
             wrapper::vload(in_top + dilation_x),
             wrapper::vload(in_top + 2 * dilation_x)
         }
     };
     const VectorType vmid =
     {
         {
             wrapper::vload(in_mid),
             wrapper::vload(in_mid + dilation_x),
             wrapper::vload(in_mid + 2 * dilation_x)
         }
     };
     const VectorType vlow =
     {
         {
             wrapper::vload(in_low),
             wrapper::vload(in_low + dilation_x),
             wrapper::vload(in_low + 2 * dilation_x)
         }
     };
 
     const int32x4x3_t vtop_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
         }
     };
     const int32x4x3_t vmid_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
         }
     };
     const int32x4x3_t vlow_s32 =
     {
         {
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
             wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
         }
     };
 
     int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
     out           = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
     out           = wrapper::vmla(out, vtop_s32.val[2], m0.val[2]);
 
     out = wrapper::vmla(out, vmid_s32.val[0], m1.val[0]);
     out = wrapper::vmla(out, vmid_s32.val[1], m1.val[1]);
     out = wrapper::vmla(out, vmid_s32.val[2], m1.val[2]);
 
     out = wrapper::vmla(out, vlow_s32.val[0], m2.val[0]);
     out = wrapper::vmla(out, vlow_s32.val[1], m2.val[1]);
     out = wrapper::vmla(out, vlow_s32.val[2], m2.val[2]);
 
     return out;
 }

◆ store_results() [1/2]

void arm_compute::detail::store_results	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

Stores an int32x4x2_t array into a memory location.

Parameters

[out]	buffer	Pointer to the memory location where the values will be stored.
[in]	values	Values that will be stored.

◆ store_results() [2/2]

void store_results	(	float *	buffer,
		const float32x4x2_t &	values
	)

Stores a float32x4x2_t array into a memory location.

Parameters

[out]	buffer	Pointer to the memory location where the values will be stored.
[in]	values	Values that will be stored.

Referenced by accumulate_results< 3 >(), NEConvolutionKernel< matrix_size >::configure(), convolve_3x3< 3 >(), load_matrix_row(), NESeparableConvolutionVertKernel< matrix_size >::run(), and NEConvolutionRectangleKernel::run().

◆ store_results< 1 >() [1/2]

void arm_compute::detail::store_results< 1 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 123 of file NEDirectConvolutionDetail.h.

 {
     vst1q_s32(buffer, values.val[0]);
     vst1q_s32(buffer + 4, values.val[1]);
 }

◆ store_results< 1 >() [2/2]

void store_results< 1 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 130 of file NEDirectConvolution3x3.h.

Referenced by accumulate_results< 3 >(), convolve_3x3(), and load_matrix_row().

 {
     vst1q_f32(buffer, values.val[0]);
     vst1q_f32(buffer + 4, values.val[1]);
 }

◆ store_results< 2 >() [1/2]

void arm_compute::detail::store_results< 2 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 130 of file NEDirectConvolutionDetail.h.

 {
     vst1q_s32(buffer, values.val[0]);
 }

◆ store_results< 2 >() [2/2]

void store_results< 2 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 137 of file NEDirectConvolution3x3.h.

Referenced by accumulate_results< 3 >(), convolve_3x3(), and load_matrix_row().

 {
     vst1q_f32(buffer, values.val[0]);
 }

◆ store_results< 3 >() [1/2]

void arm_compute::detail::store_results< 3 >	(	int32_t *	buffer,
		const int32x4x2_t &	values
	)

inline

Definition at line 136 of file NEDirectConvolutionDetail.h.

References accumulate_results().

 {
     vst1_s32(buffer, vget_low_s32(values.val[0]));
 }

◆ store_results< 3 >() [2/2]

void store_results< 3 >	(	float *	buffer,
		const float32x4x2_t &	values
	)

inline

Definition at line 143 of file NEDirectConvolution3x3.h.

References get_input_num_elems_processed().

Referenced by accumulate_results< 3 >(), convolve_3x3(), and load_matrix_row().

 {
     vst1_f32(buffer, vget_low_f32(values.val[0]));
 }

Data Structures

Functions

Function Documentation

◆ accumulate_results() [1/2]

◆ accumulate_results() [2/2]

◆ accumulate_results< 1 >() [1/2]

◆ accumulate_results< 1 >() [2/2]

◆ accumulate_results< 2 >() [1/2]

◆ accumulate_results< 2 >() [2/2]

◆ accumulate_results< 3 >() [1/2]

◆ accumulate_results< 3 >() [2/2]

◆ convolve_3x3() [1/3]

◆ convolve_3x3() [2/3]

◆ convolve_3x3() [3/3]

◆ convolve_3x3< 1 >()

◆ convolve_3x3< 2 >()

◆ convolve_3x3< 3 >()

◆ convolve_3x3_dilation() [1/2]

◆ convolve_3x3_dilation() [2/2]

◆ for_each_error() [1/2]

◆ for_each_error() [2/2]

◆ get_input_num_elems_processed() [1/2]

◆ get_input_num_elems_processed() [2/2]

◆ get_input_num_elems_processed< 1 >()

◆ get_input_num_elems_processed< 2 >()

◆ get_input_num_elems_processed< 3 >()

◆ have_different_dimensions()

◆ load_matrix_row() [1/3]

◆ load_matrix_row() [2/3]

◆ load_matrix_row() [3/3]

◆ single_convolve_3x3_dilation() [1/2]

◆ single_convolve_3x3_dilation() [2/2]

◆ store_results() [1/2]

◆ store_results() [2/2]

◆ store_results< 1 >() [1/2]

◆ store_results< 1 >() [2/2]

◆ store_results< 2 >() [1/2]

◆ store_results< 2 >() [2/2]

◆ store_results< 3 >() [1/2]

◆ store_results< 3 >() [2/2]