CMSIS-NN  Version 3.0.0
CMSIS NN Software Library
 All Data Structures Files Functions Variables Enumerations Enumerator Macros Groups Pages
arm_nnsupportfunctions.h File Reference

Data Structures

union  arm_nnword
 Union for SIMD access of q31/q15/q7 types. More...
 
struct  arm_nn_double
 Struct holding the low and high words of data type long long. More...
 
union  arm_nn_long_long
 

Macros

#define LEFT_SHIFT(_shift)
 
#define RIGHT_SHIFT(_shift)
 
#define MASK_IF_ZERO(x)
 
#define MASK_IF_NON_ZERO(x)
 
#define SELECT_USING_MASK(mask, a, b)
 
#define MAX(A, B)
 
#define MIN(A, B)
 
#define CLAMP(x, h, l)
 
#define NN_ROUND(out_shift)
 macro for adding rounding offset More...
 
#define MUL_SAT(a, b)
 
#define MUL_SAT_MVE(a, b)
 
#define MUL_POW2(a, b)
 
#define DIV_POW2(a, b)
 
#define DIV_POW2_MVE(a, b)
 
#define EXP_ON_NEG(x)
 
#define ONE_OVER1(x)
 
#define SELECT_IF_NON_ZERO(x)
 

Functions

void arm_q7_to_q15_no_shift (const q7_t *pSrc, q15_t *pDst, uint32_t blockSize)
 Converts the elements of the q7 vector to q15 vector without left-shift. More...
 
void arm_nn_add_q7 (const q7_t *input, q31_t *output, uint32_t block_size)
 Non-saturating addition of elements of a q7 vector. More...
 
void arm_q7_to_q15_reordered_no_shift (const q7_t *pSrc, q15_t *pDst, uint32_t blockSize)
 Converts the elements of the q7 vector to reordered q15 vector without left-shift. More...
 
void arm_q7_to_q15_with_offset (const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset)
 Converts the elements from a q7 vector to a q15 vector with an added offset. More...
 
void arm_q7_to_q15_reordered_with_offset (const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset)
 Converts the elements of the q7 vector to reordered q15 vector with an added offset. More...
 
void arm_nn_accumulate_q7_to_q15 (q15_t *dst, const q7_t *src, uint32_t block_size)
 Converts the elements of a q7 vector and accumulates them into a q15 vector. More...
 
q7_t * arm_nn_depthwise_conv_s8_core (const q7_t *row, const q15_t *col, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t kernel_size, const int32_t *const output_bias, q7_t *out)
 Depthwise conv on an im2col buffer where the input channel equals output channel. More...
 
q7_t * arm_nn_mat_mult_s8 (const q7_t *input_row, const q7_t *input_col, const uint16_t output_ch, const uint16_t col_batches, const int32_t *output_shift, const int32_t *output_mult, const int32_t out_offset, const int32_t col_offset, const int32_t row_offset, const int16_t out_activation_min, const int16_t out_activation_max, const uint16_t row_len, const int32_t *const bias, q7_t *out)
 General Matrix-multiplication function with per-channel requantization. More...
 
arm_status arm_nn_mat_mul_core_1x_s8 (int32_t row_elements, const int8_t *row_base, const int8_t *col_base, int32_t *const sum_col, int32_t *const output)
 General Matrix-multiplication without requantization for one row & one column. More...
 
arm_status arm_nn_mat_mul_core_4x_s8 (const int32_t row_elements, const int32_t offset, const int8_t *row_base, const int8_t *col_base, int32_t *const sum_col, int32_t *const output)
 General Matrix-multiplication without requantization for four rows and one column. More...
 
arm_status arm_nn_mat_mult_nt_t_s8 (const q7_t *lhs, const q7_t *rhs, const q31_t *bias, q7_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max)
 General Matrix-multiplication function with per-channel requantization. This function assumes: More...
 
arm_status arm_nn_vec_mat_mult_t_s8 (const q7_t *lhs, const q7_t *rhs, const q31_t *bias, q7_t *dst, const int32_t lhs_offset, const int32_t rhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
 s8 Vector by Matrix (transposed) multiplication More...
 
arm_status arm_nn_vec_mat_mult_t_svdf_s8 (const q7_t *lhs, const q7_t *rhs, q15_t *dst, const int32_t lhs_offset, const int32_t rhs_offset, const int32_t scatter_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
 s8 Vector by Matrix (transposed) multiplication with s16 output More...
 
q7_t * arm_nn_depthwise_conv_nt_t_padded_s8 (const q7_t *lhs, const q7_t *rhs, const int32_t lhs_offset, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, q7_t *out)
 Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where the padding is -lhs_offset (Range: int8). Dimensions are the same for lhs and rhs. More...
 
q7_t * arm_nn_depthwise_conv_nt_t_s8 (const q7_t *lhs, const q7_t *rhs, const int32_t lhs_offset, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, q7_t *out)
 Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs. More...
 
__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia (const q15_t **in_q15)
 Read 2 q15 elements and post increment pointer. More...
 
__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia (const q7_t **in_q7)
 Read 4 q7 from q7 pointer and post increment pointer. More...
 
__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2 (const q15_t *in_q15)
 Read 2 q15 from q15 pointer. More...
 
__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4 (const q7_t *in_q7)
 Read 4 q7 values. More...
 
__STATIC_FORCEINLINE void arm_memset_q7 (q7_t *dst, const q7_t val, uint32_t block_size)
 memset optimized for MVE More...
 
void arm_nn_mult_q15 (q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize)
 q15 vector multiplication with variable output shifts More...
 
void arm_nn_mult_q7 (q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize)
 q7 vector multiplication with variable output shifts More...
 
__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult (const q31_t m1, const q31_t m2)
 Saturating doubling high multiply. Result matches NEON instruction VQRDMULH. More...
 
__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat (const q31_t m1, const q31_t m2)
 Doubling high multiply without saturation. This is intended for requantization where the scale is a positive integer. More...
 
__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two (const q31_t dividend, const q31_t exponent)
 Rounding divide by power of two. More...
 
__STATIC_FORCEINLINE q31_t arm_nn_requantize (const q31_t val, const q31_t multiplier, const q31_t shift)
 Requantize a given value. More...
 
__STATIC_FORCEINLINE void arm_memcpy_q7 (q7_t *__RESTRICT dst, const q7_t *__RESTRICT src, uint32_t block_size)
 memcpy optimized for MVE More...
 
__STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values (int32_t val)
 
__STATIC_FORCEINLINE q31_t arm_nn_mult_by_power_of_two (const int32_t val, const int32_t exp)
 
__STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1 (int32_t val)
 
__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia (q15_t **dest_q15, q31_t src_q31)
 Write 2 q15 elements and post increment pointer. More...
 

Macro Definition Documentation

#define CLAMP (   x,
  h,
  l
)
#define DIV_POW2 (   a,
  b
)
#define DIV_POW2_MVE (   a,
  b
)

Referenced by arm_softmax_s8().

#define EXP_ON_NEG (   x)

Referenced by arm_softmax_s8(), and arm_softmax_u8().

#define LEFT_SHIFT (   _shift)

Referenced by arm_nn_requantize().

#define MASK_IF_NON_ZERO (   x)
#define MASK_IF_ZERO (   x)
#define MUL_POW2 (   a,
  b
)
#define MUL_SAT_MVE (   a,
  b
)

Referenced by arm_softmax_s8().

#define ONE_OVER1 (   x)

Referenced by arm_softmax_s8(), and arm_softmax_u8().

#define RIGHT_SHIFT (   _shift)

Referenced by arm_nn_requantize().

#define SELECT_IF_NON_ZERO (   x)
#define SELECT_USING_MASK (   mask,
  a,
  b
)

Function Documentation

__STATIC_FORCEINLINE void arm_memcpy_q7 ( q7_t *__RESTRICT  dst,
const q7_t *__RESTRICT  src,
uint32_t  block_size 
)
Parameters
[in,out] dst: Destination pointer.
[in] src: Source pointer.
[in] block_size: Number of bytes to copy.

Referenced by arm_convolve_s8(), and arm_depthwise_conv_s8_opt().

__STATIC_FORCEINLINE void arm_memset_q7 ( q7_t *  dst,
const q7_t  val,
uint32_t  block_size 
)
Parameters
[in,out] dst: Destination pointer.
[in] val: Value to set.
[in] block_size: Number of bytes to set.

Referenced by arm_depthwise_conv_s8_opt().

q7_t* arm_nn_depthwise_conv_s8_core ( const q7_t *  row,
const q15_t *  col,
const uint16_t  num_ch,
const int32_t *  out_shift,
const int32_t *  out_mult,
const int32_t  out_offset,
const int32_t  activation_min,
const int32_t  activation_max,
const uint16_t  kernel_size,
const int32_t *const  output_bias,
q7_t *  out 
)
Parameters
[in] row: pointer to row
[in] col: pointer to im2col buffer; always consists of 2 columns.
[in] num_ch: number of channels
[in] out_shift: pointer to per output channel requantization shift parameter.
[in] out_mult: pointer to per output channel requantization multiplier parameter.
[in] out_offset: output tensor offset.
[in] activation_min: minimum value to clamp the output to. Range: int8
[in] activation_max: maximum value to clamp the output to. Range: int8
[in] kernel_size: number of elements in one column.
[in] output_bias: per output channel bias. Range: int32
[out] out: pointer to output
Returns
The function returns one of the following:
  1. The incremented output pointer for a successful operation or
  2. NULL if implementation is not available.

Supported framework: TensorFlow Lite micro.

__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two ( const q31_t  dividend,
const q31_t  exponent 
)
Parameters
[in] dividend: Dividend.
[in] exponent: Divisor = power(2, exponent). Range: [0, 31]
Returns
Rounded result of division. Midpoint is rounded away from zero.

Referenced by arm_elementwise_add_s8(), and arm_nn_requantize().

__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult ( const q31_t  m1,
const q31_t  m2 
)
Parameters
[in] m1: Multiplicand. Range: {Q31_MIN, Q31_MAX}
[in] m2: Multiplier. Range: {Q31_MIN, Q31_MAX}
Returns
Result of multiplication.

Referenced by arm_elementwise_add_s8().

__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat ( const q31_t  m1,
const q31_t  m2 
)
Parameters
[in] m1: Multiplicand. Range: {Q31_MIN, Q31_MAX}
[in] m2: Multiplier. Range: {Q31_MIN, Q31_MAX}
Returns
Result of multiplication.
Note
The result of this function matches that of the NEON instruction VQRDMULH for m1 in range {Q31_MIN, Q31_MAX} and m2 in range {Q31_MIN + 1, Q31_MAX}. Saturation occurs when both m1 and m2 equal Q31_MIN; that case is not handled by this function.

References arm_nn_double::high, arm_nn_long_long::long_long, arm_nn_double::low, and arm_nn_long_long::word.

Referenced by arm_nn_requantize().

__STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values ( int32_t  val)
q7_t* arm_nn_mat_mult_s8 ( const q7_t *  input_row,
const q7_t *  input_col,
const uint16_t  output_ch,
const uint16_t  col_batches,
const int32_t *  output_shift,
const int32_t *  output_mult,
const int32_t  out_offset,
const int32_t  col_offset,
const int32_t  row_offset,
const int16_t  out_activation_min,
const int16_t  out_activation_max,
const uint16_t  row_len,
const int32_t *const  bias,
q7_t *  out 
)
Parameters
[in] input_row: pointer to row operand
[in] input_col: pointer to col operand
[in] output_ch: number of rows of input_row
[in] col_batches: number of column batches. Range: 1 to 4
[in] output_shift: pointer to per output channel requantization shift parameter.
[in] output_mult: pointer to per output channel requantization multiplier parameter.
[in] out_offset: output tensor offset.
[in] col_offset: input tensor (col) offset.
[in] row_offset: kernel offset (row). Not used.
[in] out_activation_min: minimum value to clamp the output to. Range: int8
[in] out_activation_max: maximum value to clamp the output to. Range: int8
[in] row_len: number of elements in each row
[in] bias: per output channel bias. Range: int32
[in,out] out: pointer to output
Returns
The function returns one of the following:
  1. The incremented output pointer for a successful operation or
  2. NULL if implementation is not available.

Supported framework: TensorFlow Lite

References arm_nn_requantize(), MAX, and MIN.

Referenced by arm_convolve_s8().

__STATIC_FORCEINLINE q31_t arm_nn_mult_by_power_of_two ( const int32_t  val,
const int32_t  exp 
)
__STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1 ( int32_t  val)

References MUL_POW2, and MUL_SAT.

__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2 ( const q15_t *  in_q15)
Parameters
[in] in_q15: pointer to address of input.
Returns
q31 value

Referenced by arm_depthwise_conv_s8_opt(), and arm_nn_accumulate_q7_to_q15().

__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4 ( const q7_t *  in_q7)
__STATIC_FORCEINLINE q31_t arm_nn_requantize ( const q31_t  val,
const q31_t  multiplier,
const q31_t  shift 
)
__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia ( q15_t **  dest_q15,
q31_t  src_q31 
)
Parameters
[in] dest_q15: Pointer to pointer that holds address of destination.
[in] src_q31: Input value to be written.
Returns
none

Referenced by arm_nn_accumulate_q7_to_q15(), arm_q7_to_q15_no_shift(), arm_q7_to_q15_reordered_with_offset(), arm_q7_to_q15_with_offset(), and arm_relu_q15().