CMSIS-NN  Version 3.0.0
CMSIS NN Software Library
 All Data Structures Files Functions Variables Enumerations Enumerator Macros Groups Pages
Basic Math Functions for Neural Network Computation

Functions

void arm_nn_accumulate_q7_to_q15 (q15_t *pDst, const q7_t *pSrc, uint32_t length)
 Converts the elements of a q7 vector and accumulates them into a q15 vector. More...
 
void arm_nn_add_q7 (const q7_t *input, q31_t *output, uint32_t block_size)
 Non-saturating addition of elements of a q7 vector. More...
 
q7_t * arm_nn_depthwise_conv_nt_t_padded_s8 (const q7_t *lhs, const q7_t *rhs, const int32_t lhs_offset, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, q7_t *out)
 Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where the padding is -lhs_offset(Range: int8). Dimensions are the same for lhs and rhs. More...
 
q7_t * arm_nn_depthwise_conv_nt_t_s8 (const q7_t *lhs, const q7_t *rhs, const int32_t lhs_offset, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, q7_t *out)
 Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs. More...
 
arm_status arm_nn_mat_mul_core_1x_s8 (int32_t row_elements, const int8_t *row_base, const int8_t *col_base, int32_t *const sum_col, int32_t *const output)
 General Matrix-multiplication without requantization for one row & one column. More...
 
arm_status arm_nn_mat_mul_core_4x_s8 (const int32_t row_elements, const int32_t offset, const int8_t *row_base, const int8_t *col_base, int32_t *const sum_col, int32_t *const output)
 General Matrix-multiplication without requantization for four rows and one column. More...
 
arm_status arm_nn_mat_mult_nt_t_s8 (const q7_t *lhs, const q7_t *rhs, const q31_t *bias, q7_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max)
 General Matrix-multiplication function with per-channel requantization. This function assumes: More...
 
void arm_nn_mult_q15 (q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize)
 Q15 vector multiplication with variable output shifts. More...
 
void arm_nn_mult_q7 (q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize)
 Q7 vector multiplication with variable output shifts. More...
 
arm_status arm_nn_vec_mat_mult_t_s8 (const q7_t *lhs, const q7_t *rhs, const q31_t *bias, q7_t *dst, const int32_t lhs_offset, const int32_t rhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
 s8 Vector by Matrix (transposed) multiplication More...
 
arm_status arm_nn_vec_mat_mult_t_svdf_s8 (const q7_t *lhs, const q7_t *rhs, q15_t *dst, const int32_t lhs_offset, const int32_t rhs_offset, const int32_t scatter_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
 s8 Vector by Matrix (transposed) multiplication with s16 output More...
 

Description

Basic Math Functions for Neural Network Computation

Function Documentation

void arm_nn_accumulate_q7_to_q15 ( q15_t *  dst,
const q7_t *  src,
uint32_t  block_size 
)
Parameters
[in] *src: points to the q7 input vector
[out] *dst: points to the q15 output vector
[in] block_size: length of the input vector
Description:

The equation used for the conversion process is:

 dst[n] += (q15_t) src[n] ;   0 <= n < block_size.

References arm_nn_read_q15x2(), arm_nn_read_q7x4_ia(), and arm_nn_write_q15x2_ia().

void arm_nn_add_q7 ( const q7_t *  input,
q31_t *  output,
uint32_t  block_size 
)
Parameters
[in] *input: Pointer to the q7 input vector
[out] *output: Pointer to the q31 output variable.
[in] block_size: length of the input vector
Description:

2^24 samples can be added without saturating the result.

The equation used for the summation is:

 sum = input[0] + input[1] + .. + input[block_size -1]

References arm_nn_read_q7x4_ia().

q7_t* arm_nn_depthwise_conv_nt_t_padded_s8 ( const q7_t *  lhs,
const q7_t *  rhs,
const int32_t  lhs_offset,
const uint16_t  num_ch,
const int32_t *  out_shift,
const int32_t *  out_mult,
const int32_t  out_offset,
const int32_t  activation_min,
const int32_t  activation_max,
const uint16_t  row_x_col,
const int32_t *const  output_bias,
q7_t *  out 
)
Parameters
[in] lhs: Input left-hand side matrix
[in] rhs: Input right-hand side matrix (transposed)
[in] lhs_offset: LHS matrix offset (input offset). Range: -127 to 128
[in] num_ch: Number of channels in LHS/RHS
[in] out_shift: Per channel output shift. Length of vector is equal to number of channels
[in] out_mult: Per channel output multiplier. Length of vector is equal to number of channels
[in] out_offset: Offset to be added to the output values. Range: -127 to 128
[in] activation_min: Minimum value to clamp the output to. Range: int8
[in] activation_max: Maximum value to clamp the output to. Range: int8
[in] row_x_col: (row_dimension * col_dimension) of LHS/RHS matrix
[in] output_bias: Per channel output bias. Length of vector is equal to number of channels
[in] out: Output pointer
Returns
The function returns one of the two
  • Updated output pointer if an implementation is available
  • NULL if no implementation is available.
Note
If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read for the following:
  • Output shift
  • Output multiplier
  • Output bias
  • rhs

Referenced by arm_depthwise_conv_s8_opt().

q7_t* arm_nn_depthwise_conv_nt_t_s8 ( const q7_t *  lhs,
const q7_t *  rhs,
const int32_t  lhs_offset,
const uint16_t  num_ch,
const int32_t *  out_shift,
const int32_t *  out_mult,
const int32_t  out_offset,
const int32_t  activation_min,
const int32_t  activation_max,
const uint16_t  row_x_col,
const int32_t *const  output_bias,
q7_t *  out 
)
Parameters
[in] lhs: Input left-hand side matrix
[in] rhs: Input right-hand side matrix (transposed)
[in] lhs_offset: LHS matrix offset (input offset). Range: -127 to 128
[in] num_ch: Number of channels in LHS/RHS
[in] out_shift: Per channel output shift. Length of vector is equal to number of channels.
[in] out_mult: Per channel output multiplier. Length of vector is equal to number of channels.
[in] out_offset: Offset to be added to the output values. Range: -127 to 128
[in] activation_min: Minimum value to clamp the output to. Range: int8
[in] activation_max: Maximum value to clamp the output to. Range: int8
[in] row_x_col: (row_dimension * col_dimension) of LHS/RHS matrix
[in] output_bias: Per channel output bias. Length of vector is equal to number of channels.
[in] out: Output pointer
Returns
The function returns one of the two
  • Updated output pointer if an implementation is available
  • NULL if no implementation is available.
Note
If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read for the following:
  • Output shift
  • Output multiplier
  • Output bias
  • rhs

Referenced by arm_depthwise_conv_s8_opt().

arm_status arm_nn_mat_mul_core_1x_s8 ( int32_t  row_elements,
const int8_t *  row_base,
const int8_t *  col_base,
int32_t *const  sum_col,
int32_t *const  output 
)
Parameters
[in] row_elements: number of row elements
[in] row_base: pointer to row operand
[in] col_base: pointer to col operand
[out] sum_col: pointer to store sum of column elements
[out] output: pointer to store result of multiply-accumulate
Returns
The function returns the multiply-accumulated result of the row by column.

Pseudo-code:

    *output = 0
    sum_col = 0
    for (i = 0; i < row_elements; i++)
        *output += row_base[i] * col_base[i]
        sum_col += col_base[i]

Referenced by arm_convolve_1_x_n_s8(), and arm_convolve_1x1_s8_fast().

arm_status arm_nn_mat_mul_core_4x_s8 ( const int32_t  row_elements,
const int32_t  offset,
const int8_t *  row_base,
const int8_t *  col_base,
int32_t *const  sum_col,
int32_t *const  output 
)
Parameters
[in] row_elements: number of row elements
[in] offset: offset between rows. Can be the same as row_elements. For example, in a 1x1 conv scenario with stride as 1.
[in] row_base: pointer to row operand
[in] col_base: pointer to col operand
[out] sum_col: pointer to store sum of column elements
[out] output: pointer to store result (4 int32's) of multiply-accumulate
Returns
The function returns the multiply-accumulated result of the row by column

Pseudo-code:

    output[0] = 0
       ..
    output[3] = 0
    sum_col = 0
    for (i = 0; i < row_elements; i++)
        output[0] += row_base[i] * col_base[i]
           ..
        output[3] += row_base[i + (row_elements * 3)] * col_base[i]
        sum_col += col_base[i]

Referenced by arm_convolve_1_x_n_s8(), arm_convolve_1x1_s8_fast(), and arm_convolve_s8().

arm_status arm_nn_mat_mult_nt_t_s8 ( const q7_t *  lhs,
const q7_t *  rhs,
const q31_t *  bias,
q7_t *  dst,
const int32_t *  dst_multipliers,
const int32_t *  dst_shifts,
const int32_t  lhs_rows,
const int32_t  rhs_rows,
const int32_t  rhs_cols,
const int32_t  lhs_offset,
const int32_t  dst_offset,
const int32_t  activation_min,
const int32_t  activation_max 
)
General Matrix-multiplication function with per-channel requantization. This function assumes:
  • LHS input matrix NOT transposed (nt)
  • RHS input matrix transposed (t)
Note
This operation also performs the broadcast bias addition before the requantization
Parameters
[in] lhs: Pointer to the LHS input matrix
[in] rhs: Pointer to the RHS input matrix
[in] bias: Pointer to the bias vector. The length of this vector is equal to the number of output columns (or RHS input rows)
[out] dst: Pointer to the output matrix with "m" rows and "n" columns
[in] dst_multipliers: Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in] dst_shifts: Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in] lhs_rows: Number of LHS input rows
[in] rhs_rows: Number of RHS input rows
[in] rhs_cols: Number of LHS/RHS input columns
[in] lhs_offset: Offset to be applied to the LHS input value
[in] dst_offset: Offset to be applied to the output result
[in] activation_min: Minimum value to clamp down the output. Range: int8
[in] activation_max: Maximum value to clamp up the output. Range: int8
Returns
The function returns ARM_MATH_SUCCESS

References arm_nn_read_q7x4(), arm_nn_read_q7x4_ia(), arm_nn_requantize(), MAX, and MIN.

Referenced by arm_convolve_1x1_s8_fast().

void arm_nn_mult_q15 ( q15_t *  pSrcA,
q15_t *  pSrcB,
q15_t *  pDst,
const uint16_t  out_shift,
uint32_t  blockSize 
)

q15 vector multiplication with variable output shifts

Parameters
[in] *pSrcA: pointer to the first input vector
[in] *pSrcB: pointer to the second input vector
[out] *pDst: pointer to the output vector
[in] out_shift: amount of right-shift for output
[in] blockSize: number of samples in each vector

Scaling and Overflow Behavior:

The function uses saturating arithmetic. Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.

References arm_nn_read_q15x2_ia(), and NN_ROUND.

void arm_nn_mult_q7 ( q7_t *  pSrcA,
q7_t *  pSrcB,
q7_t *  pDst,
const uint16_t  out_shift,
uint32_t  blockSize 
)

q7 vector multiplication with variable output shifts

Parameters
[in] *pSrcA: pointer to the first input vector
[in] *pSrcB: pointer to the second input vector
[out] *pDst: pointer to the output vector
[in] out_shift: amount of right-shift for output
[in] blockSize: number of samples in each vector

Scaling and Overflow Behavior:

The function uses saturating arithmetic. Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.

References NN_ROUND.

arm_status arm_nn_vec_mat_mult_t_s8 ( const q7_t *  lhs,
const q7_t *  rhs,
const q31_t *  bias,
q7_t *  dst,
const int32_t  lhs_offset,
const int32_t  rhs_offset,
const int32_t  dst_offset,
const int32_t  dst_multiplier,
const int32_t  dst_shift,
const int32_t  rhs_cols,
const int32_t  rhs_rows,
const int32_t  activation_min,
const int32_t  activation_max 
)
Parameters
[in] lhs: Input left-hand side vector
[in] rhs: Input right-hand side matrix (transposed)
[in] bias: Input bias
[out] dst: Output vector
[in] lhs_offset: Offset to be added to the input values of the left-hand side vector. Range: -127 to 128
[in] rhs_offset: Not used
[in] dst_offset: Offset to be added to the output values. Range: -127 to 128
[in] dst_multiplier: Output multiplier
[in] dst_shift: Output shift
[in] rhs_cols: Number of columns in the right-hand side input matrix
[in] rhs_rows: Number of rows in the right-hand side input matrix
[in] activation_min: Minimum value to clamp the output to. Range: int8
[in] activation_max: Maximum value to clamp the output to. Range: int8
Returns
The function returns ARM_MATH_SUCCESS

References arm_nn_read_q7x4_ia(), arm_nn_requantize(), MAX, and MIN.

Referenced by arm_fully_connected_s8().

arm_status arm_nn_vec_mat_mult_t_svdf_s8 ( const q7_t *  lhs,
const q7_t *  rhs,
q15_t *  dst,
const int32_t  lhs_offset,
const int32_t  rhs_offset,
const int32_t  scatter_offset,
const int32_t  dst_multiplier,
const int32_t  dst_shift,
const int32_t  rhs_cols,
const int32_t  rhs_rows,
const int32_t  activation_min,
const int32_t  activation_max 
)
Parameters
[in] lhs: Input left-hand side vector
[in] rhs: Input right-hand side matrix (transposed)
[out] dst: Output vector
[in] lhs_offset: Offset to be added to the input values of the left-hand side vector. Range: -127 to 128
[in] rhs_offset: Not used
[in] scatter_offset: Address offset for dst. First output is stored at 'dst', the second at 'dst + scatter_offset' and so on.
[in] dst_multiplier: Output multiplier
[in] dst_shift: Output shift
[in] rhs_cols: Number of columns in the right-hand side input matrix
[in] rhs_rows: Number of rows in the right-hand side input matrix
[in] activation_min: Minimum value to clamp the output to. Range: int16
[in] activation_max: Maximum value to clamp the output to. Range: int16
Returns
The function returns ARM_MATH_SUCCESS

References arm_nn_read_q7x4_ia(), arm_nn_requantize(), MAX, and MIN.

Referenced by arm_svdf_s8().