Functions
void	arm_nn_accumulate_q7_to_q15 (q15_t *pDst, const q7_t *pSrc, uint32_t length)
	Converts the elements from a q7 vector and accumulate to a q15 vector. More...

void	arm_nn_add_q7 (const q7_t *input, q31_t *output, uint32_t block_size)
	Non-saturating addition of elements of a q7 vector. More...

q7_t *	arm_nn_depthwise_conv_nt_t_padded_s8 (const q7_t *lhs, const q7_t *rhs, const int32_t input_offset, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, q7_t *out)
	Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where the padding is -lhs_offset (Range: int8). Dimensions are the same for lhs and rhs. More...

q7_t *	arm_nn_depthwise_conv_nt_t_s8 (const q7_t *lhs, const q7_t *rhs, const int32_t input_offset, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, q7_t *out)
	Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs. More...

arm_status	arm_nn_mat_mul_core_1x_s8 (int32_t row_elements, const int8_t *row_base, const int8_t *col_base, int32_t *const sum_col, int32_t *const output)
	General Matrix-multiplication without requantization for one row & one column. More...

int8_t *	arm_nn_mat_mul_core_4x_s8 (const int32_t row_elements, const int32_t offset, const int8_t *row_base, const int8_t *col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output)
	Matrix-multiplication with requantization & activation function for four rows and one column. More...

arm_status	arm_nn_mat_mult_nt_t_s8 (const q7_t *lhs, const q7_t *rhs, const q31_t *bias, q7_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max)
	General Matrix-multiplication function with per-channel requantization. This function assumes: More...

void	arm_nn_mult_q15 (q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize)
	Q15 vector multiplication with variable output shifts. More...

void	arm_nn_mult_q7 (q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize)
	Q7 vector multiplication with variable output shifts. More...

arm_status	arm_nn_vec_mat_mult_t_s16 (const q15_t *lhs, const q7_t *rhs, const q63_t *bias, q15_t *dst, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
	s16 Vector by Matrix (transposed) multiplication More...

arm_status	arm_nn_vec_mat_mult_t_s8 (const q7_t *lhs, const q7_t *rhs, const q31_t *bias, q7_t *dst, const int32_t lhs_offset, const int32_t rhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max, const int32_t address_offset)
	s8 Vector by Matrix (transposed) multiplication More...

arm_status	arm_nn_vec_mat_mult_t_svdf_s8 (const q7_t *lhs, const q7_t *rhs, q15_t *dst, const int32_t lhs_offset, const int32_t rhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
	s8 Vector by Matrix (transposed) multiplication with s16 output More...

Description

Basic Math Functions for Neural Network Computation

Function Documentation

void arm_nn_accumulate_q7_to_q15	(	q15_t *	dst,
		const q7_t *	src,
		uint32_t	block_size
	)

Parameters

[in]	*src	points to the q7 input vector
[out]	*dst	points to the q15 output vector
[in]	block_size	length of the input vector

Description:

The equation used for the conversion process is:

 dst[n] += (q15_t) src[n] ;   0 <= n < block_size.

References arm_nn_read_q15x2(), arm_nn_read_q7x4_ia(), and arm_nn_write_q15x2_ia().

void arm_nn_add_q7	(	const q7_t *	input,
		q31_t *	output,
		uint32_t	block_size
	)

Parameters

[in]	*input	Pointer to the q7 input vector
[out]	*output	Pointer to the q31 output variable.
[in]	block_size	length of the input vector

Description:

2^24 samples can be added without saturating the result.

The equation used for the summation is:

 sum = input[0] + input[1] + .. + input[block_size -1]

References arm_nn_read_q7x4_ia().

q7_t* arm_nn_depthwise_conv_nt_t_padded_s8	(	const q7_t *	lhs,
		const q7_t *	rhs,
		const int32_t	lhs_offset,
		const uint16_t	num_ch,
		const int32_t *	out_shift,
		const int32_t *	out_mult,
		const int32_t	out_offset,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const uint16_t	row_x_col,
		const int32_t *const	output_bias,
		q7_t *	out
	)

Parameters

[in]	lhs	Input left-hand side matrix
[in]	rhs	Input right-hand side matrix (transposed)
[in]	lhs_offset	LHS matrix offset(input offset). Range: -127 to 128
[in]	num_ch	Number of channels in LHS/RHS
[in]	out_shift	Per channel output shift. Length of vector is equal to number of channels
[in]	out_mult	Per channel output multiplier. Length of vector is equal to number of channels
[in]	out_offset	Offset to be added to the output values. Range: -127 to 128
[in]	activation_min	Minimum value to clamp the output to. Range: int8
[in]	activation_max	Maximum value to clamp the output to. Range: int8
[in]	row_x_col	(row_dimension * col_dimension) of LHS/RHS matrix
[in]	output_bias	Per channel output bias. Length of vector is equal to number of channels
[out]	out	Output pointer

Returns

The function returns one of the following:

Updated output pointer if an implementation is available
NULL if no implementation is available.

Note

If the number of channels is not a multiple of 4, up to 3 elements beyond the end of each of the following buffers will be read:

Output shift
Output multiplier
Output bias
rhs

Referenced by arm_depthwise_conv_s8_opt().

q7_t* arm_nn_depthwise_conv_nt_t_s8	(	const q7_t *	lhs,
		const q7_t *	rhs,
		const int32_t	lhs_offset,
		const uint16_t	num_ch,
		const int32_t *	out_shift,
		const int32_t *	out_mult,
		const int32_t	out_offset,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const uint16_t	row_x_col,
		const int32_t *const	output_bias,
		q7_t *	out
	)

Parameters

[in]	lhs	Input left-hand side matrix
[in]	rhs	Input right-hand side matrix (transposed)
[in]	lhs_offset	LHS matrix offset(input offset). Range: -127 to 128
[in]	num_ch	Number of channels in LHS/RHS
[in]	out_shift	Per channel output shift. Length of vector is equal to number of channels.
[in]	out_mult	Per channel output multiplier. Length of vector is equal to number of channels.
[in]	out_offset	Offset to be added to the output values. Range: -127 to 128
[in]	activation_min	Minimum value to clamp the output to. Range: int8
[in]	activation_max	Maximum value to clamp the output to. Range: int8
[in]	row_x_col	(row_dimension * col_dimension) of LHS/RHS matrix
[in]	output_bias	Per channel output bias. Length of vector is equal to number of channels.
[out]	out	Output pointer

Returns

The function returns one of the following:

Updated output pointer if an implementation is available
NULL if no implementation is available.

Note

If the number of channels is not a multiple of 4, up to 3 elements beyond the end of each of the following buffers will be read:

Output shift
Output multiplier
Output bias
rhs

Referenced by arm_depthwise_conv_s8_opt().

arm_status arm_nn_mat_mul_core_1x_s8	(	int32_t	row_elements,
		const int8_t *	row_base,
		const int8_t *	col_base,
		int32_t *const	sum_col,
		int32_t *const	output
	)

Parameters

[in]	row_elements	number of row elements
[in]	row_base	pointer to row operand
[in]	col_base	pointer to col operand
[out]	sum_col	pointer to store sum of column elements
[out]	output	pointer to store result of multiply-accumulate

Returns: The function returns the multiply-accumulated result of the row by column.

Pseudo-code:

 *output = 0
 *sum_col = 0
 for (i = 0; i < row_elements; i++)
     *output  += row_base[i] * col_base[i]
     *sum_col += col_base[i]

Referenced by arm_convolve_1_x_n_s8(), and arm_convolve_1x1_s8_fast().

int8_t* arm_nn_mat_mul_core_4x_s8	(	const int32_t	row_elements,
		const int32_t	offset,
		const int8_t *	row_base,
		const int8_t *	col_base,
		const int32_t	out_ch,
		const cmsis_nn_conv_params *	conv_params,
		const cmsis_nn_per_channel_quant_params *	quant_params,
		const int32_t *	bias,
		int8_t *	output
	)

Parameters

[in]	row_elements	number of row elements
[in]	offset	offset between rows. Can be the same as row_elements, e.g. in a 1x1 conv scenario with a stride of 1.
[in]	row_base	pointer to row operand
[in]	col_base	pointer to col operand
[in]	out_ch	Number of output channels
[in]	conv_params	Pointer to convolution parameters like offsets and activation values
[in]	quant_params	Pointer to per-channel quantization parameters
[in]	bias	Pointer to per-channel bias
[out]	output	Pointer to output where int8 results are stored.

Returns: The function returns the updated output pointer or NULL if implementation is not available.

Compliant to TFLM int8 specification. MVE implementation only

References cmsis_nn_conv_params::activation, cmsis_nn_conv_params::input_offset, cmsis_nn_activation::max, cmsis_nn_activation::min, cmsis_nn_per_channel_quant_params::multiplier, cmsis_nn_conv_params::output_offset, and cmsis_nn_per_channel_quant_params::shift.

Referenced by arm_convolve_1_x_n_s8(), arm_convolve_1x1_s8_fast(), and arm_convolve_s8().

arm_status arm_nn_mat_mult_nt_t_s8	(	const q7_t *	lhs,
		const q7_t *	rhs,
		const q31_t *	bias,
		q7_t *	dst,
		const int32_t *	dst_multipliers,
		const int32_t *	dst_shifts,
		const int32_t	lhs_rows,
		const int32_t	rhs_rows,
		const int32_t	rhs_cols,
		const int32_t	lhs_offset,
		const int32_t	dst_offset,
		const int32_t	activation_min,
		const int32_t	activation_max
	)

LHS input matrix NOT transposed (nt)
RHS input matrix transposed (t)

Note: This operation also performs the broadcast bias addition before the requantization

Parameters

[in]	lhs	Pointer to the LHS input matrix
[in]	rhs	Pointer to the RHS input matrix
[in]	bias	Pointer to the bias vector. The length of this vector is equal to the number of output columns (or RHS input rows)
[out]	dst	Pointer to the output matrix with "m" rows and "n" columns
[in]	dst_multipliers	Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in]	dst_shifts	Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in]	lhs_rows	Number of LHS input rows
[in]	rhs_rows	Number of RHS input rows
[in]	rhs_cols	Number of LHS/RHS input columns
[in]	lhs_offset	Offset to be applied to the LHS input value
[in]	dst_offset	Offset to be applied to the output result
[in]	activation_min	Minimum value to clamp down the output. Range : int8
[in]	activation_max	Maximum value to clamp up the output. Range : int8

Returns: The function returns ARM_MATH_SUCCESS

References arm_nn_read_q7x4(), arm_nn_read_q7x4_ia(), arm_nn_requantize(), MAX, and MIN.

Referenced by arm_convolve_1x1_s8_fast().

void arm_nn_mult_q15	(	q15_t *	pSrcA,
		q15_t *	pSrcB,
		q15_t *	pDst,
		const uint16_t	out_shift,
		uint32_t	blockSize
	)

q15 vector multiplication with variable output shifts

Parameters

[in]	*pSrcA	pointer to the first input vector
[in]	*pSrcB	pointer to the second input vector
[out]	*pDst	pointer to the output vector
[in]	out_shift	amount of right-shift for output
[in]	blockSize	number of samples in each vector

Scaling and Overflow Behavior:

The function uses saturating arithmetic. Results outside of the allowable Q15 range [0x8000, 0x7FFF] will be saturated.

References NN_ROUND.

void arm_nn_mult_q7	(	q7_t *	pSrcA,
		q7_t *	pSrcB,
		q7_t *	pDst,
		const uint16_t	out_shift,
		uint32_t	blockSize
	)

q7 vector multiplication with variable output shifts

Parameters

[in]	*pSrcA	pointer to the first input vector
[in]	*pSrcB	pointer to the second input vector
[out]	*pDst	pointer to the output vector
[in]	out_shift	amount of right-shift for output
[in]	blockSize	number of samples in each vector

Scaling and Overflow Behavior:

The function uses saturating arithmetic. Results outside of the allowable Q7 range [0x80, 0x7F] will be saturated.

References NN_ROUND.

arm_status arm_nn_vec_mat_mult_t_s16	(	const q15_t *	lhs,
		const q7_t *	rhs,
		const q63_t *	bias,
		q15_t *	dst,
		const int32_t	dst_multiplier,
		const int32_t	dst_shift,
		const int32_t	rhs_cols,
		const int32_t	rhs_rows,
		const int32_t	activation_min,
		const int32_t	activation_max
	)

Parameters

[in]	lhs	Input left-hand side vector
[in]	rhs	Input right-hand side matrix (transposed)
[in]	bias	Input bias
[out]	dst	Output vector
[in]	dst_multiplier	Output multiplier
[in]	dst_shift	Output shift
[in]	rhs_cols	Number of columns in the right-hand side input matrix
[in]	rhs_rows	Number of rows in the right-hand side input matrix
[in]	activation_min	Minimum value to clamp the output to. Range: int16
[in]	activation_max	Maximum value to clamp the output to. Range: int16

Returns: The function returns ARM_MATH_SUCCESS

References arm_nn_read_q15x2_ia(), arm_nn_requantize_s64(), MAX, and MIN.

Referenced by arm_fully_connected_s16().

arm_status arm_nn_vec_mat_mult_t_s8	(	const q7_t *	lhs,
		const q7_t *	rhs,
		const q31_t *	bias,
		q7_t *	dst,
		const int32_t	lhs_offset,
		const int32_t	rhs_offset,
		const int32_t	dst_offset,
		const int32_t	dst_multiplier,
		const int32_t	dst_shift,
		const int32_t	rhs_cols,
		const int32_t	rhs_rows,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const int32_t	address_offset
	)

Parameters

[in]	lhs	Input left-hand side vector
[in]	rhs	Input right-hand side matrix (transposed)
[in]	bias	Input bias
[out]	dst	Output vector
[in]	lhs_offset	Offset to be added to the input values of the left-hand side vector. Range: -127 to 128
[in]	rhs_offset	Not used
[in]	dst_offset	Offset to be added to the output values. Range: -127 to 128
[in]	dst_multiplier	Output multiplier
[in]	dst_shift	Output shift
[in]	rhs_cols	Number of columns in the right-hand side input matrix
[in]	rhs_rows	Number of rows in the right-hand side input matrix
[in]	activation_min	Minimum value to clamp the output to. Range: int8
[in]	activation_max	Maximum value to clamp the output to. Range: int8
[in]	address_offset	Memory position offset for dst. First output is stored at 'dst', the second at 'dst + address_offset' and so on. Default value is typically 1.

Returns: The function returns ARM_MATH_SUCCESS

References arm_nn_read_q7x4_ia(), arm_nn_requantize(), MAX, and MIN.

Referenced by arm_fully_connected_s8(), and arm_svdf_s8().

arm_status arm_nn_vec_mat_mult_t_svdf_s8	(	const q7_t *	lhs,
		const q7_t *	rhs,
		q15_t *	dst,
		const int32_t	lhs_offset,
		const int32_t	rhs_offset,
		const int32_t	scatter_offset,
		const int32_t	dst_multiplier,
		const int32_t	dst_shift,
		const int32_t	rhs_cols,
		const int32_t	rhs_rows,
		const int32_t	activation_min,
		const int32_t	activation_max
	)

Parameters

[in]	lhs	Input left-hand side vector
[in]	rhs	Input right-hand side matrix (transposed)
[out]	dst	Output vector
[in]	lhs_offset	Offset to be added to the input values of the left-hand side vector. Range: -127 to 128
[in]	rhs_offset	Not used
[in]	scatter_offset	Address offset for dst. First output is stored at 'dst', the second at 'dst + scatter_offset' and so on.
[in]	dst_multiplier	Output multiplier
[in]	dst_shift	Output shift
[in]	rhs_cols	Number of columns in the right-hand side input matrix
[in]	rhs_rows	Number of rows in the right-hand side input matrix
[in]	activation_min	Minimum value to clamp the output to. Range: int16
[in]	activation_max	Maximum value to clamp the output to. Range: int16

Returns: The function returns ARM_MATH_SUCCESS

References arm_nn_read_q7x4_ia(), arm_nn_requantize(), MAX, MIN, and NN_Q31_MAX.

Referenced by arm_svdf_state_s16_s8().