Functions
int16_t *	arm_nn_mat_mult_kernel_s16 (const int8_t *input_a, const int16_t *input_b, const int32_t output_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, const int32_t num_col_a, const cmsis_nn_bias_data *const bias_data, int16_t *out_0)
	Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution.

arm_cmsis_nn_status	arm_nn_depthwise_conv_nt_t_padded_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, int8_t *out)
	Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where the padding is -lhs_offset(Range: int8). Dimensions are the same for lhs and rhs.

int16_t *	arm_nn_depthwise_conv_nt_t_s16 (const int16_t *lhs, const int8_t *rhs, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int64_t *const output_bias, int16_t *out)
	Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs.

arm_cmsis_nn_status	arm_nn_depthwise_conv_nt_t_s4 (const int8_t *lhs, const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, int8_t *out)
	Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. rhs consists of packed int4 data. Dimensions are the same for lhs and rhs.

arm_cmsis_nn_status	arm_nn_depthwise_conv_nt_t_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, int8_t *out)
	Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs.

arm_cmsis_nn_status	arm_nn_mat_mul_core_1x_s4 (int32_t row_elements, const int32_t skipped_row_elements, const int8_t *row_base_ref, const int8_t *col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output)
	General Vector by Matrix multiplication with requantization, storage of result and int4 weights packed into an int8 buffer.

arm_cmsis_nn_status	arm_nn_mat_mul_core_1x_s8 (int32_t row_elements, const int32_t skipped_row_elements, const int8_t *row_base_ref, const int8_t *col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output)
	General Vector by Matrix multiplication with requantization and storage of result.

int8_t *	arm_nn_mat_mul_core_4x_s8 (const int32_t row_elements, const int32_t offset, const int8_t *row_base, const int8_t *col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output)
	Matrix-multiplication with requantization & activation function for four rows and one column.

arm_cmsis_nn_status	arm_nn_mat_mult_nt_interleaved_t_even_s4 (const int8_t *lhs, const int8_t *packed_rhs, const int32_t *bias, int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t lhs_cols_offset)
	General Matrix-multiplication function with per-channel requantization. This function assumes:

arm_cmsis_nn_status	arm_nn_mat_mult_nt_t_s16 (const int16_t *lhs, const int8_t *rhs, const cmsis_nn_bias_data *bias_data, int16_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t activation_min, const int32_t activation_max)
	General Matrix-multiplication function with per-channel requantization and int16 input (LHS) and output. This function assumes:

arm_cmsis_nn_status	arm_nn_mat_mult_nt_t_s4 (const int8_t *lhs, const int8_t *packed_rhs, const int32_t *bias, int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t lhs_cols_offset)
	General Matrix-multiplication function with per-channel requantization. This function assumes:

arm_cmsis_nn_status	arm_nn_mat_mult_nt_t_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t *bias, int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t row_address_offset, const int32_t lhs_cols_offset)
	General Matrix-multiplication function with per-channel requantization. This function assumes:

arm_cmsis_nn_status	arm_nn_mat_mult_nt_t_s8_s32 (const int8_t *lhs, const int8_t *rhs, int32_t *dst, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_idx_offset)
	General Matrix-multiplication function with int8 input and int32 output. This function assumes:

arm_cmsis_nn_status	arm_nn_transpose_conv_row_s8_s32 (const int8_t *lhs, const int8_t *rhs, int32_t *output_start, const int32_t output_index, const int32_t output_max, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t input_channels, const int32_t output_channels, const int32_t lhs_offset, const int32_t row_offset, const int32_t input_x, const int32_t stride_x, const int32_t skip_rows_top, const int32_t skip_rows_bottom)
	Row of s8 scalars multiplied with an s8 matrix and accumulated into an s32 rolling scratch buffer. Helper function for transposed convolution.

Description

Support functions for Convolution and DW Convolution

Function Documentation

◆ arm_nn_depthwise_conv_nt_t_padded_s8()

arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8	(	const int8_t *	lhs,
		const int8_t *	rhs,
		const int32_t	lhs_offset,
		const int32_t	active_ch,
		const int32_t	total_ch,
		const int32_t *	out_shift,
		const int32_t *	out_mult,
		const int32_t	out_offset,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const uint16_t	row_x_col,
		const int32_t *const	output_bias,
		int8_t *	out
	)

Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where the padding is -lhs_offset(Range: int8). Dimensions are the same for lhs and rhs.

Parameters

[in]	lhs	Input left-hand side matrix
[in]	rhs	Input right-hand side matrix (transposed)
[in]	lhs_offset	LHS matrix offset(input offset). Range: -127 to 128
[in]	active_ch	Subset of total_ch processed
[in]	total_ch	Number of channels in LHS/RHS
[in]	out_shift	Per channel output shift. Length of vector is equal to number of channels
[in]	out_mult	Per channel output multiplier. Length of vector is equal to number of channels
[in]	out_offset	Offset to be added to the output values. Range: -127 to 128
[in]	activation_min	Minimum value to clamp the output to. Range: int8
[in]	activation_max	Maximum value to clamp the output to. Range: int8
[in]	row_x_col	(row_dimension * col_dimension) of LHS/RHS matrix
[in]	output_bias	Per channel output bias. Length of vector is equal to number of channels
[out]	out	Output pointer

Returns

The function returns one of the two

ARM_CMSIS_NN_SUCCESS if an implementation is available
ARM_CMSIS_NN_NO_IMPL_ERROR if no implementation is available.

Note

If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read for each of the following:

Output shift
Output multiplier
Output bias
rhs

◆ arm_nn_depthwise_conv_nt_t_s16()

int16_t * arm_nn_depthwise_conv_nt_t_s16	(	const int16_t *	lhs,
		const int8_t *	rhs,
		const uint16_t	num_ch,
		const int32_t *	out_shift,
		const int32_t *	out_mult,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const uint16_t	row_x_col,
		const int64_t *const	output_bias,
		int16_t *	out
	)

Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs.

Parameters

[in]	lhs	Input left-hand side matrix
[in]	rhs	Input right-hand side matrix (transposed)
[in]	num_ch	Number of channels in LHS/RHS
[in]	out_shift	Per channel output shift. Length of vector is equal to number of channels.
[in]	out_mult	Per channel output multiplier. Length of vector is equal to number of channels.
[in]	activation_min	Minimum value to clamp the output to. Range: int16
[in]	activation_max	Maximum value to clamp the output to. Range: int16
[in]	row_x_col	(row_dimension * col_dimension) of LHS/RHS matrix
[in]	output_bias	Per channel output bias. Length of vector is equal to number of channels.
[out]	out	Output pointer

Returns

The function returns one of the two

Updated output pointer if an implementation is available
NULL if no implementation is available.

Note

If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read for each of the following:

Output shift
Output multiplier
Output bias
rhs

◆ arm_nn_depthwise_conv_nt_t_s4()

arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s4	(	const int8_t *	lhs,
		const int8_t *	rhs,
		const int32_t	lhs_offset,
		const int32_t	active_ch,
		const int32_t	total_ch,
		const int32_t *	out_shift,
		const int32_t *	out_mult,
		const int32_t	out_offset,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const uint16_t	row_x_col,
		const int32_t *const	output_bias,
		int8_t *	out
	)

Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. rhs consists of packed int4 data. Dimensions are the same for lhs and rhs.

Parameters

[in]	lhs	Input left-hand side matrix
[in]	rhs	Input right-hand side matrix (transposed). Consists of int4 data packed in an int8 buffer.
[in]	lhs_offset	LHS matrix offset(input offset). Range: -127 to 128
[in]	active_ch	Subset of total_ch processed
[in]	total_ch	Number of channels in LHS/RHS
[in]	out_shift	Per channel output shift. Length of vector is equal to number of channels.
[in]	out_mult	Per channel output multiplier. Length of vector is equal to number of channels.
[in]	out_offset	Offset to be added to the output values. Range: -127 to 128
[in]	activation_min	Minimum value to clamp the output to. Range: int8
[in]	activation_max	Maximum value to clamp the output to. Range: int8
[in]	row_x_col	(row_dimension * col_dimension) of LHS/RHS matrix
[in]	output_bias	Per channel output bias. Length of vector is equal to number of channels.
[out]	out	Output pointer

Returns

The function returns one of the two

ARM_CMSIS_NN_SUCCESS if an implementation is available
ARM_CMSIS_NN_NO_IMPL_ERROR if no implementation is available.

Note

If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read for each of the following:

Output shift
Output multiplier
Output bias
rhs

◆ arm_nn_depthwise_conv_nt_t_s8()

arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8	(	const int8_t *	lhs,
		const int8_t *	rhs,
		const int32_t	lhs_offset,
		const int32_t	active_ch,
		const int32_t	total_ch,
		const int32_t *	out_shift,
		const int32_t *	out_mult,
		const int32_t	out_offset,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const uint16_t	row_x_col,
		const int32_t *const	output_bias,
		int8_t *	out
	)

Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs.

Parameters

[in]	lhs	Input left-hand side matrix
[in]	rhs	Input right-hand side matrix (transposed)
[in]	lhs_offset	LHS matrix offset(input offset). Range: -127 to 128
[in]	active_ch	Subset of total_ch processed
[in]	total_ch	Number of channels in LHS/RHS
[in]	out_shift	Per channel output shift. Length of vector is equal to number of channels.
[in]	out_mult	Per channel output multiplier. Length of vector is equal to number of channels.
[in]	out_offset	Offset to be added to the output values. Range: -127 to 128
[in]	activation_min	Minimum value to clamp the output to. Range: int8
[in]	activation_max	Maximum value to clamp the output to. Range: int8
[in]	row_x_col	(row_dimension * col_dimension) of LHS/RHS matrix
[in]	output_bias	Per channel output bias. Length of vector is equal to number of channels.
[out]	out	Output pointer

Returns

The function returns one of the two

ARM_CMSIS_NN_SUCCESS if an implementation is available
ARM_CMSIS_NN_NO_IMPL_ERROR if no implementation is available.

Note

If the number of channels is not a multiple of 4, up to 3 elements outside the boundary will be read for each of the following:

Output shift
Output multiplier
Output bias
rhs

◆ arm_nn_mat_mul_core_1x_s4()

arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s4	(	int32_t	row_elements,
		const int32_t	skipped_row_elements,
		const int8_t *	row_base_ref,
		const int8_t *	col_base_ref,
		const int32_t	out_ch,
		const cmsis_nn_conv_params *	conv_params,
		const cmsis_nn_per_channel_quant_params *	quant_params,
		const int32_t *	bias,
		int8_t *	output
	)

General Vector by Matrix multiplication with requantization, storage of result and int4 weights packed into an int8 buffer.

Parameters

[in]	row_elements	number of row elements
[in]	skipped_row_elements	number of row elements skipped due to padding. row_elements + skipped_row_elements = (kernel_x * kernel_y) * input_ch
[in]	row_base_ref	pointer to row operand
[in]	col_base_ref	pointer to col operand as packed int4
[in]	out_ch	Number of output channels
[in]	conv_params	Pointer to convolution parameters like offsets and activation values
[in]	quant_params	Pointer to per-channel quantization parameters
[in]	bias	Pointer to optional per-channel bias
[out]	output	Pointer to output where int8 results are stored.

Returns: The function performs matrix(row_base_ref) multiplication with vector(col_base_ref) and scaled result is stored in memory.

Pseudo-code as int8 example. Int4 filter data will be unpacked.

    *output = 0
    sum_col = 0
    for (j = 0; j < out_ch; j++)
        for (i = 0; i < row_elements; i++)
            *output += row_base_ref[i] * col_base_ref[i]
            sum_col += col_base_ref[i]
        scale sum_col using quant_params and bias
        store result in 'output'

◆ arm_nn_mat_mul_core_1x_s8()

arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8	(	int32_t	row_elements,
		const int32_t	skipped_row_elements,
		const int8_t *	row_base_ref,
		const int8_t *	col_base_ref,
		const int32_t	out_ch,
		const cmsis_nn_conv_params *	conv_params,
		const cmsis_nn_per_channel_quant_params *	quant_params,
		const int32_t *	bias,
		int8_t *	output
	)

General Vector by Matrix multiplication with requantization and storage of result.

Parameters

[in]	row_elements	number of row elements
[in]	skipped_row_elements	number of row elements skipped due to padding. row_elements + skipped_row_elements = (kernel_x * kernel_y) * input_ch
[in]	row_base_ref	pointer to row operand
[in]	col_base_ref	pointer to col operand
[in]	out_ch	Number of output channels
[in]	conv_params	Pointer to convolution parameters like offsets and activation values
[in]	quant_params	Pointer to per-channel quantization parameters
[in]	bias	Pointer to optional per-channel bias
[out]	output	Pointer to output where int8 results are stored.

Returns: The function performs matrix(row_base_ref) multiplication with vector(col_base_ref) and scaled result is stored in memory.

Pseudo-code:

    *output = 0
    sum_col = 0
    for (j = 0; j < out_ch; j++)
        for (i = 0; i < row_elements; i++)
            *output += row_base_ref[i] * col_base_ref[i]
            sum_col += col_base_ref[i]
        scale sum_col using quant_params and bias
        store result in 'output'

◆ arm_nn_mat_mul_core_4x_s8()

int8_t * arm_nn_mat_mul_core_4x_s8	(	const int32_t	row_elements,
		const int32_t	offset,
		const int8_t *	row_base,
		const int8_t *	col_base,
		const int32_t	out_ch,
		const cmsis_nn_conv_params *	conv_params,
		const cmsis_nn_per_channel_quant_params *	quant_params,
		const int32_t *	bias,
		int8_t *	output
	)

Matrix-multiplication with requantization & activation function for four rows and one column.

Parameters

[in]	row_elements	number of row elements
[in]	offset	Offset between rows. Can be the same as row_elements, for example in a 1x1 conv scenario with a stride of 1.
[in]	row_base	pointer to row operand
[in]	col_base	pointer to col operand
[in]	out_ch	Number of output channels
[in]	conv_params	Pointer to convolution parameters like offsets and activation values
[in]	quant_params	Pointer to per-channel quantization parameters
[in]	bias	Pointer to per-channel bias
[out]	output	Pointer to output where int8 results are stored.

Returns: The function returns the updated output pointer or NULL if implementation is not available.

Compliant to TFLM int8 specification. MVE implementation only

◆ arm_nn_mat_mult_kernel_s16()

int16_t * arm_nn_mat_mult_kernel_s16	(	const int8_t *	input_a,
		const int16_t *	input_b,
		const int32_t	output_ch,
		const int32_t *	out_shift,
		const int32_t *	out_mult,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const int32_t	num_col_a,
		const cmsis_nn_bias_data *const	bias_data,
		int16_t *	out_0
	)

Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution.

Parameters

[in]	input_a	pointer to operand A
[in]	input_b	pointer to operand B, always consists of 2 vectors.
[in]	output_ch	number of rows of A
[in]	out_shift	pointer to per output channel requantization shift parameter.
[in]	out_mult	pointer to per output channel requantization multiplier parameter.
[in]	activation_min	minimum value to clamp the output to. Range : int16
[in]	activation_max	maximum value to clamp the output to. Range : int16
[in]	num_col_a	number of columns of A
[in]	bias_data	pointer to struct with bias vector. The length of this vector is equal to the number of output columns (or RHS input rows). The vector can be int32 or int64 indicated by a flag in the struct.
[in,out]	out_0	pointer to output

Returns

The function returns one of the two

The incremented output pointer for a successful operation or
NULL if implementation is not available.

This function does the matrix multiplication of weight matrix for all output channels with 2 columns from im2col and produces two elements/output_channel. The outputs are clamped in the range provided by activation min and max. Supported framework: TensorFlow Lite micro.

◆ arm_nn_mat_mult_nt_interleaved_t_even_s4()

arm_cmsis_nn_status arm_nn_mat_mult_nt_interleaved_t_even_s4	(	const int8_t *	lhs,
		const int8_t *	rhs,
		const int32_t *	bias,
		int8_t *	dst,
		const int32_t *	dst_multipliers,
		const int32_t *	dst_shifts,
		const int32_t	lhs_rows,
		const int32_t	rhs_rows,
		const int32_t	rhs_cols,
		const int32_t	lhs_offset,
		const int32_t	dst_offset,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const int32_t	lhs_cols_offset
	)

General Matrix-multiplication function with per-channel requantization. This function assumes:

LHS input matrix NOT transposed (nt)
RHS input matrix transposed (t)
RHS is int8 packed with 2x int4
LHS is int8
LHS/RHS input columns must be even numbered
LHS must be interleaved. Compare to arm_nn_mat_mult_nt_t_s4 where LHS is not interleaved.

Note: This operation also performs the broadcast bias addition before the requantization

Parameters

[in]	lhs	Pointer to the LHS input matrix
[in]	rhs	Pointer to the RHS input matrix
[in]	bias	Pointer to the bias vector. The length of this vector is equal to the number of output columns (or RHS input rows)
[out]	dst	Pointer to the output matrix with "m" rows and "n" columns
[in]	dst_multipliers	Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in]	dst_shifts	Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in]	lhs_rows	Number of LHS input rows
[in]	rhs_rows	Number of RHS input rows
[in]	rhs_cols	Number of LHS/RHS input columns. Note this must be even.
[in]	lhs_offset	Offset to be applied to the LHS input value
[in]	dst_offset	Offset to be applied to the output result
[in]	activation_min	Minimum value to clamp down the output. Range : int8
[in]	activation_max	Maximum value to clamp up the output. Range : int8
[in]	lhs_cols_offset	Column offset between subsequent lhs_rows

Returns: The function returns ARM_CMSIS_NN_SUCCESS

◆ arm_nn_mat_mult_nt_t_s16()

arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s16	(	const int16_t *	lhs,
		const int8_t *	rhs,
		const cmsis_nn_bias_data *	bias_data,
		int16_t *	dst,
		const int32_t *	dst_multipliers,
		const int32_t *	dst_shifts,
		const int32_t	lhs_rows,
		const int32_t	rhs_rows,
		const int32_t	rhs_cols,
		const int32_t	activation_min,
		const int32_t	activation_max
	)

General Matrix-multiplication function with per-channel requantization and int16 input (LHS) and output. This function assumes:

LHS input matrix NOT transposed (nt)
RHS input matrix transposed (t)

Note: This operation also performs the broadcast bias addition before the requantization

Parameters

[in]	lhs	Pointer to the LHS input matrix
[in]	rhs	Pointer to the RHS input matrix
[in]	bias_data	Pointer to struct with bias vector. The length of this vector is equal to the number of output columns (or RHS input rows). The vector can be int32 or int64 indicated by a flag in the struct.
[out]	dst	Pointer to the output matrix with "m" rows and "n" columns
[in]	dst_multipliers	Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in]	dst_shifts	Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in]	lhs_rows	Number of LHS input rows
[in]	rhs_rows	Number of RHS input rows
[in]	rhs_cols	Number of LHS/RHS input columns
[in]	activation_min	Minimum value to clamp down the output. Range : int16
[in]	activation_max	Maximum value to clamp up the output. Range : int16

MVE implementation only.

Returns: The function returns ARM_CMSIS_NN_SUCCESS or ARM_CMSIS_NN_NO_IMPL_ERROR if not for MVE

◆ arm_nn_mat_mult_nt_t_s4()

arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s4	(	const int8_t *	lhs,
		const int8_t *	rhs,
		const int32_t *	bias,
		int8_t *	dst,
		const int32_t *	dst_multipliers,
		const int32_t *	dst_shifts,
		const int32_t	lhs_rows,
		const int32_t	rhs_rows,
		const int32_t	rhs_cols,
		const int32_t	lhs_offset,
		const int32_t	dst_offset,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const int32_t	lhs_cols_offset
	)

General Matrix-multiplication function with per-channel requantization. This function assumes:

LHS input matrix NOT transposed (nt)
RHS input matrix transposed (t)
RHS is int8 packed with 2x int4
LHS is int8

Note: This operation also performs the broadcast bias addition before the requantization

Parameters

[in]	lhs	Pointer to the LHS input matrix
[in]	rhs	Pointer to the RHS input matrix
[in]	bias	Pointer to the bias vector. The length of this vector is equal to the number of output columns (or RHS input rows)
[out]	dst	Pointer to the output matrix with "m" rows and "n" columns
[in]	dst_multipliers	Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in]	dst_shifts	Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in]	lhs_rows	Number of LHS input rows
[in]	rhs_rows	Number of RHS input rows
[in]	rhs_cols	Number of LHS/RHS input columns
[in]	lhs_offset	Offset to be applied to the LHS input value
[in]	dst_offset	Offset to be applied to the output result
[in]	activation_min	Minimum value to clamp down the output. Range : int8
[in]	activation_max	Maximum value to clamp up the output. Range : int8
[in]	lhs_cols_offset	Column offset between subsequent lhs_rows

Returns: The function returns ARM_CMSIS_NN_SUCCESS

◆ arm_nn_mat_mult_nt_t_s8()

arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8	(	const int8_t *	lhs,
		const int8_t *	rhs,
		const int32_t *	bias,
		int8_t *	dst,
		const int32_t *	dst_multipliers,
		const int32_t *	dst_shifts,
		const int32_t	lhs_rows,
		const int32_t	rhs_rows,
		const int32_t	rhs_cols,
		const int32_t	lhs_offset,
		const int32_t	dst_offset,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const int32_t	row_address_offset,
		const int32_t	lhs_cols_offset
	)

General Matrix-multiplication function with per-channel requantization. This function assumes:

LHS input matrix NOT transposed (nt)
RHS input matrix transposed (t)

Note: This operation also performs the broadcast bias addition before the requantization

Parameters

[in]	lhs	Pointer to the LHS input matrix
[in]	rhs	Pointer to the RHS input matrix
[in]	bias	Pointer to the bias vector. The length of this vector is equal to the number of output columns (or RHS input rows)
[out]	dst	Pointer to the output matrix with "m" rows and "n" columns
[in]	dst_multipliers	Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in]	dst_shifts	Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows)
[in]	lhs_rows	Number of LHS input rows
[in]	rhs_rows	Number of RHS input rows
[in]	rhs_cols	Number of LHS/RHS input columns
[in]	lhs_offset	Offset to be applied to the LHS input value
[in]	dst_offset	Offset to be applied to the output result
[in]	activation_min	Minimum value to clamp down the output. Range : int8
[in]	activation_max	Maximum value to clamp up the output. Range : int8
[in]	row_address_offset	Address offset between rows in output. NOTE: Only used for MVEI extension.
[in]	lhs_cols_offset	Column offset between subsequent lhs_rows

Returns: The function returns ARM_CMSIS_NN_SUCCESS

◆ arm_nn_mat_mult_nt_t_s8_s32()

arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8_s32	(	const int8_t *	lhs,
		const int8_t *	rhs,
		int32_t *	dst,
		const int32_t	lhs_rows,
		const int32_t	rhs_rows,
		const int32_t	rhs_cols,
		const int32_t	lhs_offset,
		const int32_t	dst_idx_offset
	)

General Matrix-multiplication function with int8 input and int32 output. This function assumes:

LHS input matrix NOT transposed (nt)
RHS input matrix transposed (t)

Note: Dst/output buffer must be zeroed out before calling this function.

Parameters

[in]	lhs	Pointer to the LHS input matrix
[in]	rhs	Pointer to the RHS input matrix
[out]	dst	Pointer to the output matrix with "m" rows and "n" columns
[in]	lhs_rows	Number of LHS input rows
[in]	rhs_rows	Number of LHS input columns/RHS input rows
[in]	rhs_cols	Number of RHS input columns
[in]	lhs_offset	Offset to be applied to the LHS input value
[in]	dst_idx_offset	Offset between subsequent output results

Returns: The function returns ARM_CMSIS_NN_SUCCESS

◆ arm_nn_transpose_conv_row_s8_s32()

arm_cmsis_nn_status arm_nn_transpose_conv_row_s8_s32	(	const int8_t *	lhs,
		const int8_t *	rhs,
		int32_t *	output_start,
		const int32_t	output_index,
		const int32_t	output_max,
		const int32_t	rhs_rows,
		const int32_t	rhs_cols,
		const int32_t	input_channels,
		const int32_t	output_channels,
		const int32_t	lhs_offset,
		const int32_t	row_offset,
		const int32_t	input_x,
		const int32_t	stride_x,
		const int32_t	skip_row_top,
		const int32_t	skip_row_bottom
	)

Row of s8 scalars multiplied with an s8 matrix and accumulated into an s32 rolling scratch buffer. Helper function for transposed convolution.

Parameters

[in]	lhs	Input left-hand side scalars
[in]	rhs	Input right-hand side matrix
[out]	output_start	Output buffer start
[in]	output_index	Output buffer current index
[in]	output_max	Output buffer size
[in]	rhs_rows	Number of rows in rhs matrix
[in]	rhs_cols	Number of columns in rhs matrix
[in]	input_channels	Number of input channels
[in]	output_channels	Number of output channels
[in]	lhs_offset	Offset added to lhs before multiplication
[in]	row_offset	Address offset between each row of data output
[in]	input_x	Length of lhs scalar row.
[in]	stride_x	Address offset between each scalar-matrix multiplication result.
[in]	skip_row_top	Skip rows on top of the filter, used for padding.
[in]	skip_row_bottom	Skip rows in the bottom of the filter, used for padding.

Returns: The function returns ARM_CMSIS_NN_SUCCESS

Note: Rolling buffer refers to how the function wraps around the scratch buffer, e.g. it starts writing at [output_start + output_index], writes to [output_start + output_max] and then continues at [output_start] again.

Functions

Description

Function Documentation

◆ arm_nn_depthwise_conv_nt_t_padded_s8()

◆ arm_nn_depthwise_conv_nt_t_s16()

◆ arm_nn_depthwise_conv_nt_t_s4()

◆ arm_nn_depthwise_conv_nt_t_s8()

◆ arm_nn_mat_mul_core_1x_s4()

◆ arm_nn_mat_mul_core_1x_s8()

◆ arm_nn_mat_mul_core_4x_s8()

◆ arm_nn_mat_mult_kernel_s16()

◆ arm_nn_mat_mult_nt_interleaved_t_even_s4()

◆ arm_nn_mat_mult_nt_t_s16()

◆ arm_nn_mat_mult_nt_t_s4()

◆ arm_nn_mat_mult_nt_t_s8()

◆ arm_nn_mat_mult_nt_t_s8_s32()

◆ arm_nn_transpose_conv_row_s8_s32()