Data Structures
union	arm_nnword
	Union for SIMD access of q31/s16/s8 types. More...

struct	arm_nn_double
	Union for data type long long. More...

union	arm_nn_long_long

Macros
#define	USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims)

#define	LEFT_SHIFT(_shift) (_shift > 0 ? _shift : 0)

#define	RIGHT_SHIFT(_shift) (_shift > 0 ? 0 : -_shift)

#define	MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0

#define	MASK_IF_NON_ZERO(x) (x) != 0 ? ~0 : 0

#define	SELECT_USING_MASK(mask, a, b) ((mask) & (a)) ^ (~(mask) & (b))

#define	MAX(A, B) ((A) > (B) ? (A) : (B))

#define	MIN(A, B) ((A) < (B) ? (A) : (B))

#define	CLAMP(x, h, l) MAX(MIN((x), (h)), (l))

#define	REDUCE_MULTIPLIER(_mult) ((_mult < 0x7FFF0000) ? ((_mult + (1 << 15)) >> 16) : 0x7FFF)

#define	CH_IN_BLOCK_MVE (124)

#define	PACK_S8x4_32x1(v0, v1, v2, v3)
	definition to pack four 8 bit values. More...

#define	PACK_Q15x2_32x1(v0, v1) (((int32_t)v0 & (int32_t)0xFFFF) \| ((int32_t)v1 << 16))
	definition to pack two 16 bit values. More...

#define	NN_ROUND(out_shift) ((0x1 << out_shift) >> 1)
	macro for adding rounding offset More...

#define	MUL_SAT(a, b) arm_nn_doubling_high_mult((a), (b))

#define	MUL_SAT_MVE(a, b) arm_doubling_high_mult_mve_32x4((a), (b))

#define	MUL_POW2(a, b) arm_nn_mult_by_power_of_two((a), (b))

#define	DIV_POW2(a, b) arm_nn_divide_by_power_of_two((a), (b))

#define	DIV_POW2_MVE(a, b) arm_divide_by_power_of_two_mve((a), (b))

#define	EXP_ON_NEG(x) arm_nn_exp_on_negative_values((x))

#define	ONE_OVER1(x) arm_nn_one_over_one_plus_x_for_x_in_0_1((x))

#define	SELECT_IF_NON_ZERO(x)

Functions
void	arm_q7_to_q15_with_offset (const int8_t src, int16_t dst, int32_t block_size, int16_t offset)
	Converts the elements from a s8 vector to a s16 vector with an added offset. More...

int32_t	arm_depthwise_conv_s8_opt_get_buffer_size_dsp (const cmsis_nn_dims input_dims, const cmsis_nn_dims filter_dims)
	Get the required buffer size for optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. This is for processors with DSP extension. Refer to arm_depthwise_conv_s8_opt_get_buffer_size() for function argument details. More...

int8_t *	arm_nn_depthwise_conv_s8_core (const int8_t row, const int16_t col, const uint16_t num_ch, const int32_t out_shift, const int32_t out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t kernel_size, const int32_t const output_bias, int8_t out)
	Depthwise conv on an im2col buffer where the input channel equals output channel. More...

int8_t *	arm_nn_mat_mult_s8 (const int8_t input_row, const int8_t input_col, const uint16_t output_ch, const uint16_t col_batches, const int32_t output_shift, const int32_t output_mult, const int32_t out_offset, const int32_t col_offset, const int32_t row_offset, const int16_t out_activation_min, const int16_t out_activation_max, const uint16_t row_len, const int32_t const bias, int8_t out)
	General Matrix-multiplication function with per-channel requantization. More...

int16_t *	arm_nn_mat_mult_kernel_s16 (const int8_t input_a, const int16_t input_b, const int32_t output_ch, const int32_t out_shift, const int32_t out_mult, const int16_t activation_min, const int16_t activation_max, const int32_t num_col_a, const int64_t const output_bias, int16_t out_0)
	Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution. More...

arm_cmsis_nn_status	arm_nn_mat_mul_core_1x_s8 (int32_t row_elements, const int32_t skipped_row_elements, const int8_t row_base_ref, const int8_t col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params conv_params, const cmsis_nn_per_channel_quant_params quant_params, const int32_t bias, int8_t output)
	General Vector by Matrix multiplication with requantization and storage of result. More...

int8_t *	arm_nn_mat_mul_core_4x_s8 (const int32_t row_elements, const int32_t offset, const int8_t row_base, const int8_t col_base, const int32_t out_ch, const cmsis_nn_conv_params conv_params, const cmsis_nn_per_channel_quant_params quant_params, const int32_t bias, int8_t output)
	Matrix-multiplication with requantization & activation function for four rows and one column. More...

arm_cmsis_nn_status	arm_nn_mat_mult_nt_t_s4 (const int8_t lhs, const int8_t rhs, const int32_t bias, int8_t dst, const int32_t dst_multipliers, const int32_t dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t lhs_cols_offset)
	General Matrix-multiplication function with per-channel requantization. This function assumes: More...

arm_cmsis_nn_status	arm_nn_mat_mult_nt_t_s8 (const int8_t lhs, const int8_t rhs, const int32_t bias, int8_t dst, const int32_t dst_multipliers, const int32_t dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t lhs_cols_offset)
	General Matrix-multiplication function with per-channel requantization. This function assumes: More...

arm_cmsis_nn_status	arm_nn_mat_mult_nt_t_s8_s32 (const int8_t lhs, const int8_t rhs, int32_t *dst, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_idx_offset)
	General Matrix-multiplication function with int8 input and int32 output. This function assumes: More...

arm_cmsis_nn_status	arm_nn_vec_mat_mult_t_s4 (const int8_t lhs, const int8_t packed_rhs, const int32_t bias, int8_t dst, const int32_t lhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max, const int32_t address_offset)
	s4 Vector by Matrix (transposed) multiplication More...

arm_cmsis_nn_status	arm_nn_vec_mat_mult_t_s8 (const int8_t lhs, const int8_t rhs, const int32_t kernel_sum, const int32_t bias, int8_t *dst, const int32_t lhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max, const int32_t address_offset)
	s8 Vector by Matrix (transposed) multiplication More...

arm_cmsis_nn_status	arm_nn_vec_mat_mult_t_s16 (const int16_t lhs, const int8_t rhs, const int64_t bias, int16_t dst, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
	s16 Vector by Matrix (transposed) multiplication More...

arm_cmsis_nn_status	arm_nn_vec_mat_mult_t_svdf_s8 (const int8_t lhs, const int8_t rhs, int16_t *dst, const int32_t lhs_offset, const int32_t scatter_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
	s8 Vector by Matrix (transposed) multiplication with s16 output More...

arm_cmsis_nn_status	arm_nn_depthwise_conv_nt_t_padded_s8 (const int8_t lhs, const int8_t rhs, const int32_t lhs_offset, const int32_t active_ch, const int32_t total_ch, const int32_t out_shift, const int32_t out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t const output_bias, int8_t out)
	Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where the padding is -lhs_offset(Range: int8). Dimensions are the same for lhs and rhs. More...

arm_cmsis_nn_status	arm_nn_depthwise_conv_nt_t_s8 (const int8_t lhs, const int8_t rhs, const int32_t lhs_offset, const int32_t active_ch, const int32_t total_ch, const int32_t out_shift, const int32_t out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t const output_bias, int8_t out)
	Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs. More...

int16_t *	arm_nn_depthwise_conv_nt_t_s16 (const int16_t lhs, const int8_t rhs, const uint16_t num_ch, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int64_t const output_bias, int16_t out)
	Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs. More...

int8_t *	arm_nn_mat_mult_kernel_s4_s16 (const int8_t input_a, const int16_t input_b, const uint16_t output_ch, const int32_t out_shift, const int32_t out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const int32_t num_col_a, const int32_t const output_bias, int8_t out_0)
	Matrix-multiplication function for convolution with per-channel requantization and 4 bit weights. More...

int8_t *	arm_nn_mat_mult_kernel_s8_s16 (const int8_t input_a, const int16_t input_b, const uint16_t output_ch, const int32_t out_shift, const int32_t out_mult, const int32_t out_offset, const int16_t activation_min, const int16_t activation_max, const int32_t num_col_a, const int32_t aligned_num_col_a, const int32_t const output_bias, int8_t out_0)
	Matrix-multiplication function for convolution with per-channel requantization. More...

void	arm_nn_softmax_common_s8 (const int8_t input, const int32_t num_rows, const int32_t row_size, const int32_t mult, const int32_t shift, const int32_t diff_min, const bool int16_output, void output)
	Common softmax function for s8 input and s8 or s16 output. More...

arm_cmsis_nn_status	arm_nn_lstm_step_s8_s16 (const int8_t input, const int8_t input_to_input_weight, const int8_t input_to_forget_weight, const int8_t input_to_cell_weight, const int8_t input_to_output_weight, const int8_t recurrent_to_input_weight, const int8_t recurrent_to_forget_weight, const int8_t recurrent_to_cell_weight, const int8_t recurrent_to_output_weight, const cmsis_nn_lstm_params lstm, const int n_batch, const int n_cell, const int n_input, const int n_output, int8_t output_state, int16_t cell_state, int8_t output, cmsis_nn_lstm_context scratch_buffers)
	Update LSTM function for an iteration step. More...

void	arm_nn_lstm_calculate_gate_s8_s16 (const int8_t input, const int8_t input_to_gate_weights, const int32_t input_to_gate_bias, const cmsis_nn_scaling input_to_gate_scaling, const int8_t output_state, const int8_t recurrent_to_gate_weights, const int32_t recurrent_to_gate_bias, const cmsis_nn_scaling recurrent_to_gate_scaling, const int32_t n_batch, const int32_t n_input, const int32_t n_output, const int32_t n_cell, const arm_nn_activation_type activation_type, int16_t *gate)
	Updates a LSTM gate for an iteration step of LSTM function, int8x8_16 version. More...

void	arm_nn_lstm_update_cell_state_s16 (const int32_t n_block, const int32_t cell_state_scale, int16_t cell_state, const int16_t input_gate, const int16_t forget_gate, const int16_t cell_gate)
	Update cell state for a single LSTM iteration step, int8x8_16 version. More...

void	arm_nn_lstm_update_output_s8_s16 (const int n_batch, const int n_cell, int16_t cell_state, const int32_t cell_state_scale, const int16_t output_gate, const cmsis_nn_scaling hidden_scale, const int32_t hidden_offset, int8_t output_state, int16_t cell_gate_scratch)
	Calculate the output state tensor of an LSTM step, s8 input/output and s16 weight version. More...

void	arm_nn_vec_mat_mul_result_acc_s8 (const int8_t lhs_in, const int8_t rhs_in, const int32_t bias, int16_t dst, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t batch)
	The result of the multiplication is accumulated to the passed result buffer. Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch dimension composed by input vectors independent from each other). More...

arm_cmsis_nn_status	arm_elementwise_mul_s16_s8 (const int16_t input_1_vect, const int16_t input_2_vect, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t block_size)
	s16 elementwise multiplication with s8 output More...

Macro Definition Documentation

◆ CH_IN_BLOCK_MVE

#define CH_IN_BLOCK_MVE (124)

◆ CLAMP

#define CLAMP	(	x,
		h,
		l
	)	MAX(MIN((x), (h)), (l))

◆ DIV_POW2

#define DIV_POW2	(	a,
		b
	)	arm_nn_divide_by_power_of_two((a), (b))

◆ DIV_POW2_MVE

#define DIV_POW2_MVE	(	a,
		b
	)	arm_divide_by_power_of_two_mve((a), (b))

◆ EXP_ON_NEG

#define EXP_ON_NEG ( x ) arm_nn_exp_on_negative_values((x))

◆ LEFT_SHIFT

#define LEFT_SHIFT ( _shift ) (_shift > 0 ? _shift : 0)

◆ MASK_IF_NON_ZERO

#define MASK_IF_NON_ZERO ( x ) (x) != 0 ? ~0 : 0

◆ MASK_IF_ZERO

#define MASK_IF_ZERO ( x ) (x) == 0 ? ~0 : 0

◆ MAX

#define MAX	(	A,
		B
	)	((A) > (B) ? (A) : (B))

◆ MIN

#define MIN	(	A,
		B
	)	((A) < (B) ? (A) : (B))

◆ MUL_POW2

#define MUL_POW2	(	a,
		b
	)	arm_nn_mult_by_power_of_two((a), (b))

◆ MUL_SAT

#define MUL_SAT	(	a,
		b
	)	arm_nn_doubling_high_mult((a), (b))

◆ MUL_SAT_MVE

#define MUL_SAT_MVE	(	a,
		b
	)	arm_doubling_high_mult_mve_32x4((a), (b))

◆ NN_ROUND

#define NN_ROUND ( out_shift ) ((0x1 << out_shift) >> 1)

◆ ONE_OVER1

#define ONE_OVER1 ( x ) arm_nn_one_over_one_plus_x_for_x_in_0_1((x))

◆ PACK_Q15x2_32x1

#define PACK_Q15x2_32x1	(	v0,
		v1
	)	(((int32_t)v0 & (int32_t)0xFFFF) \| ((int32_t)v1 << 16))

◆ PACK_S8x4_32x1

#define PACK_S8x4_32x1	(	v0,
		v1,
		v2,
		v3
	)

Value:

((((int32_t)(v0) << 0) & (int32_t)0x000000FF) | (((int32_t)(v1) << 8) & (int32_t)0x0000FF00) | \

(((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | (((int32_t)(v3) << 24) & (int32_t)0xFF000000))

◆ REDUCE_MULTIPLIER

#define REDUCE_MULTIPLIER ( _mult ) ((_mult < 0x7FFF0000) ? ((_mult + (1 << 15)) >> 16) : 0x7FFF)

◆ RIGHT_SHIFT

#define RIGHT_SHIFT ( _shift ) (_shift > 0 ? 0 : -_shift)

◆ SELECT_IF_NON_ZERO

#define SELECT_IF_NON_ZERO ( x )

Value:

    {                                                                                                                  \
        mask = MASK_IF_NON_ZERO(remainder & (1 << shift++));                                                           \
        result = SELECT_USING_MASK(mask, MUL_SAT(result, x), result);                                                  \
    }

◆ SELECT_USING_MASK

#define SELECT_USING_MASK	(	mask,
		a,
		b
	)	((mask) & (a)) ^ (~(mask) & (b))

◆ USE_FAST_DW_CONV_S16_FUNCTION

#define USE_FAST_DW_CONV_S16_FUNCTION	(	dw_conv_params,
		filter_dims,
		input_dims
	)

Value:

(dw_conv_params->ch_mult == 1 && dw_conv_params->dilation.w == 1 && dw_conv_params->dilation.h == 1 && \

filter_dims->w * filter_dims->h < 512)

Function Documentation

◆ arm_nn_depthwise_conv_s8_core()

int8_t * arm_nn_depthwise_conv_s8_core	(	const int8_t *	row,
		const int16_t *	col,
		const uint16_t	num_ch,
		const int32_t *	out_shift,
		const int32_t *	out_mult,
		const int32_t	out_offset,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const uint16_t	kernel_size,
		const int32_t *const	output_bias,
		int8_t *	out
	)

Parameters

[in]	row	pointer to row
[in]	col	pointer to im2col buffer, always consists of 2 columns.
[in]	num_ch	number of channels
[in]	out_shift	pointer to per output channel requantization shift parameter.
[in]	out_mult	pointer to per output channel requantization multiplier parameter.
[in]	out_offset	output tensor offset.
[in]	activation_min	minimum value to clamp the output to. Range : int8
[in]	activation_max	maximum value to clamp the output to. Range : int8
[in]	kernel_size	number of elements in one column.
[in]	output_bias	per output channel bias. Range : int32
[out]	out	pointer to output

Returns

The function returns one of the two

The incremented output pointer for a successful operation or
NULL if implementation is not available.

Supported framework: TensorFlow Lite micro.

◆ arm_nn_mat_mult_kernel_s4_s16()

int8_t * arm_nn_mat_mult_kernel_s4_s16	(	const int8_t *	input_a,
		const int16_t *	input_b,
		const uint16_t	output_ch,
		const int32_t *	out_shift,
		const int32_t *	out_mult,
		const int32_t	out_offset,
		const int32_t	activation_min,
		const int32_t	activation_max,
		const int32_t	num_col_a,
		const int32_t *const	output_bias,
		int8_t *	out_0
	)

Parameters

[in]	input_a	pointer to operand A, int8 packed with 2x int4.
[in]	input_b	pointer to operand B, always consists of 2 vectors.
[in]	output_ch	number of rows of A
[in]	out_shift	pointer to per output channel requantization shift parameter.
[in]	out_mult	pointer to per output channel requantization multiplier parameter.
[in]	out_offset	output tensor offset.
[in]	activation_min	minimum value to clamp the output to. Range : int8
[in]	activation_max	maximum value to clamp the output to. Range : int8
[in]	num_col_a	number of columns of A
[in]	output_bias	per output channel bias. Range : int32
[in,out]	out_0	pointer to output

Returns

The function returns one of the two

The incremented output pointer for a successful operation or
NULL if implementation is not available.

This function does the matrix multiplication of weight matrix for all output channels with 2 columns from im2col and produces two elements/output_channel. The outputs are clamped in the range provided by activation min and max. Supported framework: TensorFlow Lite micro.

◆ arm_nn_mat_mult_kernel_s8_s16()

int8_t * arm_nn_mat_mult_kernel_s8_s16	(	const int8_t *	input_a,
		const int16_t *	input_b,
		const uint16_t	output_ch,
		const int32_t *	out_shift,
		const int32_t *	out_mult,
		const int32_t	out_offset,
		const int16_t	activation_min,
		const int16_t	activation_max,
		const int32_t	num_col_a,
		const int32_t	aligned_num_col_a,
		const int32_t *const	output_bias,
		int8_t *	out_0
	)

Parameters

[in]	input_a	pointer to operand A
[in]	input_b	pointer to operand B, always consists of 2 vectors.
[in]	output_ch	number of rows of A
[in]	out_shift	pointer to per output channel requantization shift parameter.
[in]	out_mult	pointer to per output channel requantization multiplier parameter.
[in]	out_offset	output tensor offset.
[in]	activation_min	minimum value to clamp the output to. Range : int8
[in]	activation_max	maximum value to clamp the output to. Range : int8
[in]	num_col_a	number of columns of A
[in]	aligned_num_col_a	number of columns of A aligned by 4
[in]	output_bias	per output channel bias. Range : int32
[in,out]	out_0	pointer to output

Returns

The function returns one of the two

The incremented output pointer for a successful operation or
NULL if implementation is not available.

This function does the matrix multiplication of weight matrix for all output channels with 2 columns from im2col and produces two elements/output_channel. The outputs are clamped in the range provided by activation min and max. Supported framework: TensorFlow Lite micro.

◆ arm_nn_mat_mult_s8()

int8_t * arm_nn_mat_mult_s8	(	const int8_t *	input_row,
		const int8_t *	input_col,
		const uint16_t	output_ch,
		const uint16_t	col_batches,
		const int32_t *	output_shift,
		const int32_t *	output_mult,
		const int32_t	out_offset,
		const int32_t	col_offset,
		const int32_t	row_offset,
		const int16_t	out_activation_min,
		const int16_t	out_activation_max,
		const uint16_t	row_len,
		const int32_t *const	bias,
		int8_t *	out
	)

Parameters

[in]	input_row	pointer to row operand
[in]	input_col	pointer to col operand
[in]	output_ch	number of rows of input_row
[in]	col_batches	number of column batches. Range: 1 to 4
[in]	output_shift	pointer to per output channel requantization shift parameter.
[in]	output_mult	pointer to per output channel requantization multiplier parameter.
[in]	out_offset	output tensor offset.
[in]	col_offset	input tensor(col) offset.
[in]	row_offset	kernel offset(row). Not used.
[in]	out_activation_min	minimum value to clamp the output to. Range : int8
[in]	out_activation_max	maximum value to clamp the output to. Range : int8
[in]	row_len	number of elements in each row
[in]	bias	per output channel bias. Range : int32
[in,out]	out	pointer to output

Returns

The function returns one of the two

The incremented output pointer for a successful operation or
NULL if implementation is not available.

Supported framework: TensorFlow Lite

Data Structures

Macros

Functions

Macro Definition Documentation

◆ CH_IN_BLOCK_MVE

◆ CLAMP

◆ DIV_POW2

◆ DIV_POW2_MVE

◆ EXP_ON_NEG

◆ LEFT_SHIFT

◆ MASK_IF_NON_ZERO

◆ MASK_IF_ZERO

◆ MAX

◆ MIN

◆ MUL_POW2

◆ MUL_SAT

◆ MUL_SAT_MVE

◆ NN_ROUND

◆ ONE_OVER1

◆ PACK_Q15x2_32x1

◆ PACK_S8x4_32x1

◆ REDUCE_MULTIPLIER

◆ RIGHT_SHIFT

◆ SELECT_IF_NON_ZERO

◆ SELECT_USING_MASK

◆ USE_FAST_DW_CONV_S16_FUNCTION

Function Documentation

◆ arm_nn_depthwise_conv_s8_core()

◆ arm_nn_mat_mult_kernel_s4_s16()

◆ arm_nn_mat_mult_kernel_s8_s16()

◆ arm_nn_mat_mult_s8()