Compute Library
 21.05
NEDirectConvolutionDetail.h File Reference
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "support/Requires.h"
#include <arm_neon.h>

Go to the source code of this file.

Namespaces

 arm_compute
 Copyright (c) 2017-2021 Arm Limited.
 
 arm_compute::detail
 

Functions

float32x4x3_t load_matrix_row (const float *ptr, int weights_offset=0)
 Loads a 3x3 matrix as a row (float). More...
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x3_t load_matrix_row (const T *ptr, int weights_offset=0)
 Loads a 3x3 matrix as a row (uint8_t/int8_t). More...
 
template<unsigned int stridex>
void store_results (float *buffer, const float32x4x2_t &values)
 Stores a float32x4x2_t array into a memory location. More...
 
template<>
void store_results< 1 > (float *buffer, const float32x4x2_t &values)
 
template<>
void store_results< 2 > (float *buffer, const float32x4x2_t &values)
 
template<>
void store_results< 3 > (float *buffer, const float32x4x2_t &values)
 
template<unsigned int stridex>
void store_results (int32_t *buffer, const int32x4x2_t &values)
 Stores an int32x4x2_t array into a memory location. More...
 
template<>
void store_results< 1 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void store_results< 2 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void store_results< 3 > (int32_t *buffer, const int32x4x2_t &values)
 
template<unsigned int stridex>
void accumulate_results (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 1 > (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 2 > (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 3 > (float *buffer, const float32x4x2_t &values)
 
template<unsigned int stridex>
void accumulate_results (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 1 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 2 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 3 > (int32_t *buffer, const int32x4x2_t &values)
 
float32x4_t single_convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, int input_offset)
 Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. More...
 
float32x4x2_t convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset=0)
 Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. More...
 
template<bool accumulate>
void convolve_3x3 (const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, unsigned int stridex, int input_offset=0)
 Perform a convolve3x3 on float32. More...
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4_t single_convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, size_t dilation_x, int32_t input_offset)
 Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. More...
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x2_t convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset)
 Perform a 3x3 convolution for 8 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. More...
 
template<bool accumulate, typename T1 , typename T2 , ARM_COMPUTE_REQUIRES_TA(std::is_same< T1, uint8_t >::value||std::is_same< T1, int8_t >::value) >
void convolve_3x3 (const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, unsigned int stridex, int32_t input_offset)
 Perform a convolve3x3 on 8-bit elements. More...
 
int get_input_num_elems_processed (unsigned int num_elems_written_per_iteration, unsigned int stridex)
 Get the number of input elements processed, given the number of elements written per iteration and the stride along x. More...