CMSIS-NN  
CMSIS NN Software Library
 
Loading...
Searching...
No Matches
arm_nnsupportfunctions.h File Reference

Data Structures

union  arm_nnword
 Union for SIMD access of q31/s16/s8 types. More...
 
struct  arm_nn_double
 Struct for data type long long. More...
 
union  arm_nn_long_long
 

Macros

#define USE_FAST_DW_CONV_S16_FUNCTION(dw_conv_params, filter_dims, input_dims)
 
#define LEFT_SHIFT(_shift)   (_shift > 0 ? _shift : 0)
 
#define RIGHT_SHIFT(_shift)   (_shift > 0 ? 0 : -_shift)
 
#define MASK_IF_ZERO(x)   (x) == 0 ? ~0 : 0
 
#define MASK_IF_NON_ZERO(x)   (x) != 0 ? ~0 : 0
 
#define SELECT_USING_MASK(mask, a, b)   ((mask) & (a)) ^ (~(mask) & (b))
 
#define MAX(A, B)   ((A) > (B) ? (A) : (B))
 
#define MIN(A, B)   ((A) < (B) ? (A) : (B))
 
#define CLAMP(x, h, l)   MAX(MIN((x), (h)), (l))
 
#define REDUCE_MULTIPLIER(_mult)   ((_mult < 0x7FFF0000) ? ((_mult + (1 << 15)) >> 16) : 0x7FFF)
 
#define CH_IN_BLOCK_MVE   (124)
 
#define S4_CH_IN_BLOCK_MVE   (124)
 
#define MAX_COL_COUNT   (512)
 
#define REVERSE_TCOL_EFFICIENT_THRESHOLD   (16)
 
#define CONVERT_DW_CONV_WITH_ONE_INPUT_CH_AND_OUTPUT_CH_ABOVE_THRESHOLD   (1)
 
#define OPTIONAL_RESTRICT_KEYWORD
 
#define PACK_S8x4_32x1(v0, v1, v2, v3)
 definition to pack four 8 bit values.
 
#define PACK_Q15x2_32x1(v0, v1)   (((int32_t)v0 & (int32_t)0xFFFF) | ((int32_t)v1 << 16))
 definition to pack two 16 bit values.
 
#define NN_ROUND(out_shift)   ((0x1 << out_shift) >> 1)
 macro for adding rounding offset
 
#define MUL_SAT(a, b)   arm_nn_doubling_high_mult((a), (b))
 
#define MUL_SAT_MVE(a, b)   arm_doubling_high_mult_mve_32x4((a), (b))
 
#define MUL_POW2(a, b)   arm_nn_mult_by_power_of_two((a), (b))
 
#define DIV_POW2(a, b)   arm_nn_divide_by_power_of_two((a), (b))
 
#define DIV_POW2_MVE(a, b)   arm_divide_by_power_of_two_mve((a), (b))
 
#define EXP_ON_NEG(x)   arm_nn_exp_on_negative_values((x))
 
#define ONE_OVER1(x)   arm_nn_one_over_one_plus_x_for_x_in_0_1((x))
 
#define SELECT_IF_NON_ZERO(x)
 

Functions

void arm_q7_to_q15_with_offset (const int8_t *src, int16_t *dst, int32_t block_size, int16_t offset)
 Converts the elements from an s8 vector to an s16 vector with an added offset.
 
int32_t arm_depthwise_conv_s8_opt_get_buffer_size_mve (const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
 Get the required buffer size for optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. This is for processors with MVE extension. Refer to arm_depthwise_conv_s8_opt_get_buffer_size() for function argument details.
 
int32_t arm_depthwise_conv_s8_opt_get_buffer_size_dsp (const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
 Get the required buffer size for optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. This is for processors with DSP extension. Refer to arm_depthwise_conv_s8_opt_get_buffer_size() for function argument details.
 
int8_t * arm_nn_depthwise_conv_s8_core (const int8_t *row, const int16_t *col, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t kernel_size, const int32_t *const output_bias, int8_t *out)
 Depthwise conv on an im2col buffer where the input channel equals output channel.
 
int8_t * arm_nn_mat_mult_s8 (const int8_t *input_row, const int8_t *input_col, const uint16_t output_ch, const uint16_t col_batches, const int32_t *output_shift, const int32_t *output_mult, const int32_t out_offset, const int32_t col_offset, const int32_t row_offset, const int16_t out_activation_min, const int16_t out_activation_max, const uint16_t row_len, const int32_t *const bias, int8_t *out)
 General Matrix-multiplication function with per-channel requantization.
 
int16_t * arm_nn_mat_mult_kernel_s16 (const int8_t *input_a, const int16_t *input_b, const int32_t output_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, const int32_t num_col_a, const cmsis_nn_bias_data *const bias_data, int16_t *out_0)
 Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution.
 
arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8 (int32_t row_elements, const int32_t skipped_row_elements, const int8_t *row_base_ref, const int8_t *col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output)
 General Vector by Matrix multiplication with requantization and storage of result.
 
arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s4 (int32_t row_elements, const int32_t skipped_row_elements, const int8_t *row_base_ref, const int8_t *col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output)
 General Vector by Matrix multiplication with requantization, storage of result and int4 weights packed into an int8 buffer.
 
int8_t * arm_nn_mat_mul_core_4x_s8 (const int32_t row_elements, const int32_t offset, const int8_t *row_base, const int8_t *col_base, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output)
 Matrix-multiplication with requantization & activation function for four rows and one column.
 
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s4 (const int8_t *lhs, const int8_t *rhs, const int32_t *bias, int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t lhs_cols_offset)
 General Matrix-multiplication function with per-channel requantization. This function assumes:
 
arm_cmsis_nn_status arm_nn_mat_mult_nt_interleaved_t_even_s4 (const int8_t *lhs, const int8_t *rhs, const int32_t *bias, int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t lhs_cols_offset)
 General Matrix-multiplication function with per-channel requantization. This function assumes:
 
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t *bias, int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t row_address_offset, const int32_t lhs_cols_offset)
 General Matrix-multiplication function with per-channel requantization. This function assumes:
 
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s16 (const int16_t *lhs, const int8_t *rhs, const cmsis_nn_bias_data *bias_data, int16_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t activation_min, const int32_t activation_max)
 General Matrix-multiplication function with per-channel requantization and int16 input (LHS) and output. This function assumes:
 
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8_s32 (const int8_t *lhs, const int8_t *rhs, int32_t *dst, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_idx_offset)
 General Matrix-multiplication function with int8 input and int32 output. This function assumes:
 
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s4 (const int8_t *lhs, const int8_t *packed_rhs, const int32_t *bias, int8_t *dst, const int32_t lhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
 s4 Vector by Matrix (transposed) multiplication
 
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t *kernel_sum, const int32_t *bias, int8_t *dst, const int32_t lhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max, const int32_t address_offset, const int32_t rhs_offset)
 s8 Vector by Matrix (transposed) multiplication
 
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_per_ch_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t *kernel_sum, const int32_t *bias, int8_t *dst, const int32_t lhs_offset, const int32_t dst_offset, const int32_t *dst_multiplier, const int32_t *dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max, const int32_t address_offset, const int32_t rhs_offset)
 s8 Vector by Matrix (transposed) multiplication using per channel quantization for output
 
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16 (const int16_t *lhs, const int8_t *rhs, const int64_t *bias, int16_t *dst, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
 s16 Vector by s8 Matrix (transposed) multiplication
 
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s16_s16 (const int16_t *lhs, const int16_t *rhs, const int64_t *bias, int16_t *dst, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
 s16 Vector by s16 Matrix (transposed) multiplication
 
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_svdf_s8 (const int8_t *lhs, const int8_t *rhs, int16_t *dst, const int32_t lhs_offset, const int32_t scatter_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max)
 s8 Vector by Matrix (transposed) multiplication with s16 output
 
arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t lhs_offset, const int32_t active_ch, const int32_t total_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, int8_t *out)
 Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where the padding is -lhs_offset(Range: int8). Dimensions are the same for lhs and rhs.
 
arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t lhs_offset, const int32_t active_ch, const int32_t total_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, int8_t *out)
 Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs.
 
arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s4 (const int8_t *lhs, const int8_t *rhs, const int32_t lhs_offset, const int32_t active_ch, const int32_t total_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, int8_t *out)
 Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. rhs consists of packed int4 data. Dimensions are the same for lhs and rhs.
 
int16_t * arm_nn_depthwise_conv_nt_t_s16 (const int16_t *lhs, const int8_t *rhs, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int64_t *const output_bias, int16_t *out)
 Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs.
 
arm_cmsis_nn_status arm_nn_transpose_conv_row_s8_s32 (const int8_t *lhs, const int8_t *rhs, int32_t *output_start, const int32_t output_index, const int32_t output_max, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t input_channels, const int32_t output_channels, const int32_t lhs_offset, const int32_t row_offset, const int32_t input_x, const int32_t stride_x, const int32_t skip_row_top, const int32_t skip_row_bottom)
 Row of s8 scalars multiplied with an s8 matrix and accumulated into an s32 rolling scratch buffer. Helper function for transposed convolution.
 
int8_t * arm_nn_mat_mult_kernel_s4_s16 (const int8_t *input_a, const int16_t *input_b, const uint16_t output_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const int32_t num_col_a, const int32_t *const output_bias, int8_t *out_0)
 Matrix-multiplication function for convolution with per-channel requantization and 4 bit weights.
 
int8_t * arm_nn_mat_mult_kernel_s8_s16 (const int8_t *input_a, const int16_t *input_b, const uint16_t output_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int16_t activation_min, const int16_t activation_max, const int32_t num_col_a, const int32_t aligned_num_col_a, const int32_t *const output_bias, int8_t *out_0)
 Matrix-multiplication function for convolution with per-channel requantization.
 
int8_t * arm_nn_mat_mult_kernel_row_offset_s8_s16 (const int8_t *input_a, const int16_t *input_b, const uint16_t output_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int16_t activation_min, const int16_t activation_max, const int32_t num_col_a, const int32_t aligned_num_col_a, const int32_t *const output_bias, const int32_t row_address_offset, int8_t *out_0)
 Matrix-multiplication function for convolution with per-channel requantization, supporting an address offset between rows.
 
void arm_nn_softmax_common_s8 (const int8_t *input, const int32_t num_rows, const int32_t row_size, const int32_t mult, const int32_t shift, const int32_t diff_min, const bool int16_output, void *output)
 Common softmax function for s8 input and s8 or s16 output.
 
arm_cmsis_nn_status arm_nn_lstm_step_s8 (const int8_t *data_in, const int8_t *hidden_in, int8_t *hidden_out, const cmsis_nn_lstm_params *params, cmsis_nn_lstm_context *buffers, const int32_t batch_offset)
 Update LSTM function for an iteration step using s8 input and output, and s16 internally.
 
arm_cmsis_nn_status arm_nn_lstm_step_s16 (const int16_t *data_in, const int16_t *hidden_in, int16_t *hidden_out, const cmsis_nn_lstm_params *params, cmsis_nn_lstm_context *buffers, const int32_t batch_offset)
 Update LSTM function for an iteration step using s16 input and output, and s16 internally.
 
arm_cmsis_nn_status arm_nn_lstm_calculate_gate_s8_s16 (const int8_t *data_in, const int8_t *hidden_in, const cmsis_nn_lstm_gate *gate_data, const cmsis_nn_lstm_params *params, int16_t *output, const int32_t batch_offset)
 Updates an LSTM gate for an iteration step of LSTM function, int8x8_16 version.
 
arm_cmsis_nn_status arm_nn_lstm_calculate_gate_s16 (const int16_t *data_in, const int16_t *hidden_in, const cmsis_nn_lstm_gate *gate_data, const cmsis_nn_lstm_params *params, int16_t *output, const int32_t batch_offset)
 Updates an LSTM gate for an iteration step of LSTM function, int16x8_16 version.
 
arm_cmsis_nn_status arm_nn_vec_mat_mul_result_acc_s8_s16 (const int8_t *lhs, const int8_t *rhs, const int32_t *effective_bias, int16_t *dst, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t batches, const int32_t batch_offset)
 The result of the multiplication is accumulated to the passed result buffer. Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch dimension composed by input vectors independent from each other).
 
arm_cmsis_nn_status arm_nn_vec_mat_mul_result_acc_s16 (const int16_t *lhs, const int8_t *rhs, const int64_t *effective_bias, int16_t *dst, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t batches, const int32_t batch_offset)
 The result of the multiplication is accumulated to the passed result buffer. Multiplies a matrix by a "batched" vector (i.e. a matrix with a batch dimension composed by input vectors independent from each other).
 
arm_cmsis_nn_status arm_elementwise_mul_s16_s8 (const int16_t *input_1_vect, const int16_t *input_2_vect, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t block_size, const int32_t batch_size, const int32_t batch_offset)
 s16 elementwise multiplication with s8 output
 
arm_cmsis_nn_status arm_elementwise_mul_s16_batch_offset (const int16_t *input_1_vect, const int16_t *input_2_vect, int16_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t block_size, const int32_t batch_size, const int32_t batch_offset)
 s16 elementwise multiplication with s16 output
 
arm_cmsis_nn_status arm_elementwise_mul_acc_s16 (const int16_t *input_1_vect, const int16_t *input_2_vect, const int32_t input_1_offset, const int32_t input_2_offset, int16_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t out_activation_min, const int32_t out_activation_max, const int32_t block_size)
 s16 elementwise multiplication. The result of the multiplication is accumulated to the passed result buffer.
 

Macro Definition Documentation

◆ CH_IN_BLOCK_MVE

#define CH_IN_BLOCK_MVE   (124)

◆ CLAMP

#define CLAMP (   x,
  h,
  l 
)    MAX(MIN((x), (h)), (l))

◆ CONVERT_DW_CONV_WITH_ONE_INPUT_CH_AND_OUTPUT_CH_ABOVE_THRESHOLD

#define CONVERT_DW_CONV_WITH_ONE_INPUT_CH_AND_OUTPUT_CH_ABOVE_THRESHOLD   (1)

◆ DIV_POW2

#define DIV_POW2 (   a,
  b 
)    arm_nn_divide_by_power_of_two((a), (b))

◆ DIV_POW2_MVE

#define DIV_POW2_MVE (   a,
  b 
)    arm_divide_by_power_of_two_mve((a), (b))

◆ EXP_ON_NEG

#define EXP_ON_NEG (   x)    arm_nn_exp_on_negative_values((x))

◆ LEFT_SHIFT

#define LEFT_SHIFT (   _shift)    (_shift > 0 ? _shift : 0)

◆ MASK_IF_NON_ZERO

#define MASK_IF_NON_ZERO (   x)    (x) != 0 ? ~0 : 0

◆ MASK_IF_ZERO

#define MASK_IF_ZERO (   x)    (x) == 0 ? ~0 : 0

◆ MAX

#define MAX (   A,
  B 
)    ((A) > (B) ? (A) : (B))

◆ MAX_COL_COUNT

#define MAX_COL_COUNT   (512)

◆ MIN

#define MIN (   A,
  B 
)    ((A) < (B) ? (A) : (B))

◆ MUL_POW2

#define MUL_POW2 (   a,
  b 
)    arm_nn_mult_by_power_of_two((a), (b))

◆ MUL_SAT

#define MUL_SAT (   a,
  b 
)    arm_nn_doubling_high_mult((a), (b))

◆ MUL_SAT_MVE

#define MUL_SAT_MVE (   a,
  b 
)    arm_doubling_high_mult_mve_32x4((a), (b))

◆ NN_ROUND

#define NN_ROUND (   out_shift)    ((0x1 << out_shift) >> 1)

macro for adding rounding offset

◆ ONE_OVER1

#define ONE_OVER1 (   x)    arm_nn_one_over_one_plus_x_for_x_in_0_1((x))

◆ OPTIONAL_RESTRICT_KEYWORD

#define OPTIONAL_RESTRICT_KEYWORD

◆ PACK_Q15x2_32x1

#define PACK_Q15x2_32x1 (   v0,
  v1 
)    (((int32_t)v0 & (int32_t)0xFFFF) | ((int32_t)v1 << 16))

definition to pack two 16 bit values.

◆ PACK_S8x4_32x1

#define PACK_S8x4_32x1 (   v0,
  v1,
  v2,
  v3 
)
Value:
((((int32_t)(v0) << 0) & (int32_t)0x000000FF) | (((int32_t)(v1) << 8) & (int32_t)0x0000FF00) | \
(((int32_t)(v2) << 16) & (int32_t)0x00FF0000) | (((int32_t)(v3) << 24) & (int32_t)0xFF000000))

definition to pack four 8 bit values.

◆ REDUCE_MULTIPLIER

#define REDUCE_MULTIPLIER (   _mult)    ((_mult < 0x7FFF0000) ? ((_mult + (1 << 15)) >> 16) : 0x7FFF)

◆ REVERSE_TCOL_EFFICIENT_THRESHOLD

#define REVERSE_TCOL_EFFICIENT_THRESHOLD   (16)

◆ RIGHT_SHIFT

#define RIGHT_SHIFT (   _shift)    (_shift > 0 ? 0 : -_shift)

◆ S4_CH_IN_BLOCK_MVE

#define S4_CH_IN_BLOCK_MVE   (124)

◆ SELECT_IF_NON_ZERO

#define SELECT_IF_NON_ZERO (   x)
Value:
{ \
mask = MASK_IF_NON_ZERO(remainder & (1 << shift++)); \
result = SELECT_USING_MASK(mask, MUL_SAT(result, x), result); \
}

◆ SELECT_USING_MASK

#define SELECT_USING_MASK (   mask,
  a,
  b 
)    ((mask) & (a)) ^ (~(mask) & (b))

◆ USE_FAST_DW_CONV_S16_FUNCTION

#define USE_FAST_DW_CONV_S16_FUNCTION (   dw_conv_params,
  filter_dims,
  input_dims 
)
Value:
(dw_conv_params->ch_mult == 1 && dw_conv_params->dilation.w == 1 && dw_conv_params->dilation.h == 1 && \
filter_dims->w * filter_dims->h < 512)

Function Documentation

◆ arm_nn_depthwise_conv_s8_core()

int8_t * arm_nn_depthwise_conv_s8_core ( const int8_t *  row,
const int16_t *  col,
const uint16_t  num_ch,
const int32_t *  out_shift,
const int32_t *  out_mult,
const int32_t  out_offset,
const int32_t  activation_min,
const int32_t  activation_max,
const uint16_t  kernel_size,
const int32_t *const  output_bias,
int8_t *  out 
)

Depthwise conv on an im2col buffer where the input channel equals output channel.

Parameters
[in]rowpointer to row
[in]colpointer to im2col buffer, always consists of 2 columns.
[in]num_chnumber of channels
[in]out_shiftpointer to per output channel requantization shift parameter.
[in]out_multpointer to per output channel requantization multiplier parameter.
[in]out_offsetoutput tensor offset.
[in]activation_minminimum value to clamp the output to. Range : int8
[in]activation_maxmaximum value to clamp the output to. Range : int8
[in]kernel_sizenumber of elements in one column.
[in]output_biasper output channel bias. Range : int32
[out]outpointer to output
Returns
The function returns one of the two
  1. The incremented output pointer for a successful operation or
  2. NULL if implementation is not available.

    Supported framework: TensorFlow Lite micro.

◆ arm_nn_mat_mult_kernel_row_offset_s8_s16()

int8_t * arm_nn_mat_mult_kernel_row_offset_s8_s16 ( const int8_t *  input_a,
const int16_t *  input_b,
const uint16_t  output_ch,
const int32_t *  out_shift,
const int32_t *  out_mult,
const int32_t  out_offset,
const int16_t  activation_min,
const int16_t  activation_max,
const int32_t  num_col_a,
const int32_t  aligned_num_col_a,
const int32_t *const  output_bias,
const int32_t  row_address_offset,
int8_t *  out_0 
)

Matrix-multiplication function for convolution with per-channel requantization, supporting an address offset between rows.

Parameters
[in]input_apointer to operand A
[in]input_bpointer to operand B, always consists of 2 vectors.
[in]output_chnumber of rows of A
[in]out_shiftpointer to per output channel requantization shift parameter.
[in]out_multpointer to per output channel requantization multiplier parameter.
[in]out_offsetoutput tensor offset.
[in]activation_minminimum value to clamp the output to. Range : int8
[in]activation_maxmaximum value to clamp the output to. Range : int8
[in]num_col_anumber of columns of A
[in]aligned_num_col_anumber of columns of A aligned by 4
[in]output_biasper output channel bias. Range : int32
[in]row_address_offsetaddress offset between rows in the output
[in,out]out_0pointer to output
Returns
The function returns one of the two
  1. The incremented output pointer for a successful operation or
  2. NULL if implementation is not available.

    This function does the matrix multiplication of weight matrix for all output channels with 2 columns from im2col and produces two elements/output_channel. The outputs are clamped in the range provided by activation min and max.

This function is slightly less performant than arm_nn_mat_mult_kernel_s8_s16, but allows support for grouped convolution. Supported framework: TensorFlow Lite micro.

◆ arm_nn_mat_mult_kernel_s4_s16()

int8_t * arm_nn_mat_mult_kernel_s4_s16 ( const int8_t *  input_a,
const int16_t *  input_b,
const uint16_t  output_ch,
const int32_t *  out_shift,
const int32_t *  out_mult,
const int32_t  out_offset,
const int32_t  activation_min,
const int32_t  activation_max,
const int32_t  num_col_a,
const int32_t *const  output_bias,
int8_t *  out_0 
)

Matrix-multiplication function for convolution with per-channel requantization and 4 bit weights.

Parameters
[in]input_apointer to operand A, int8 packed with 2x int4.
[in]input_bpointer to operand B, always consists of 2 vectors.
[in]output_chnumber of rows of A
[in]out_shiftpointer to per output channel requantization shift parameter.
[in]out_multpointer to per output channel requantization multiplier parameter.
[in]out_offsetoutput tensor offset.
[in]activation_minminimum value to clamp the output to. Range : int8
[in]activation_maxmaximum value to clamp the output to. Range : int8
[in]num_col_anumber of columns of A
[in]output_biasper output channel bias. Range : int32
[in,out]out_0pointer to output
Returns
The function returns one of the two
  1. The incremented output pointer for a successful operation or
  2. NULL if implementation is not available.

    This function does the matrix multiplication of weight matrix for all output channels with 2 columns from im2col and produces two elements/output_channel. The outputs are clamped in the range provided by activation min and max. Supported framework: TensorFlow Lite micro.

◆ arm_nn_mat_mult_kernel_s8_s16()

int8_t * arm_nn_mat_mult_kernel_s8_s16 ( const int8_t *  input_a,
const int16_t *  input_b,
const uint16_t  output_ch,
const int32_t *  out_shift,
const int32_t *  out_mult,
const int32_t  out_offset,
const int16_t  activation_min,
const int16_t  activation_max,
const int32_t  num_col_a,
const int32_t  aligned_num_col_a,
const int32_t *const  output_bias,
int8_t *  out_0 
)

Matrix-multiplication function for convolution with per-channel requantization.

Parameters
[in]input_apointer to operand A
[in]input_bpointer to operand B, always consists of 2 vectors.
[in]output_chnumber of rows of A
[in]out_shiftpointer to per output channel requantization shift parameter.
[in]out_multpointer to per output channel requantization multiplier parameter.
[in]out_offsetoutput tensor offset.
[in]activation_minminimum value to clamp the output to. Range : int8
[in]activation_maxmaximum value to clamp the output to. Range : int8
[in]num_col_anumber of columns of A
[in]aligned_num_col_anumber of columns of A aligned by 4
[in]output_biasper output channel bias. Range : int32
[in,out]out_0pointer to output
Returns
The function returns one of the two
  1. The incremented output pointer for a successful operation or
  2. NULL if implementation is not available.

    This function does the matrix multiplication of weight matrix for all output channels with 2 columns from im2col and produces two elements/output_channel. The outputs are clamped in the range provided by activation min and max. Supported framework: TensorFlow Lite micro.

◆ arm_nn_mat_mult_s8()

int8_t * arm_nn_mat_mult_s8 ( const int8_t *  input_row,
const int8_t *  input_col,
const uint16_t  output_ch,
const uint16_t  col_batches,
const int32_t *  output_shift,
const int32_t *  output_mult,
const int32_t  out_offset,
const int32_t  col_offset,
const int32_t  row_offset,
const int16_t  out_activation_min,
const int16_t  out_activation_max,
const uint16_t  row_len,
const int32_t *const  bias,
int8_t *  out 
)

General Matrix-multiplication function with per-channel requantization.

Parameters
[in]input_rowpointer to row operand
[in]input_colpointer to col operand
[in]output_chnumber of rows of input_row
[in]col_batchesnumber of column batches. Range: 1 to 4
[in]output_shiftpointer to per output channel requantization shift parameter.
[in]output_multpointer to per output channel requantization multiplier parameter.
[in]out_offsetoutput tensor offset.
[in]col_offsetinput tensor(col) offset.
[in]row_offsetkernel offset(row). Not used.
[in]out_activation_minminimum value to clamp the output to. Range : int8
[in]out_activation_maxmaximum value to clamp the output to. Range : int8
[in]row_lennumber of elements in each row
[in]biasper output channel bias. Range : int32
[in,out]outpointer to output
Returns
The function returns one of the two
  1. The incremented output pointer for a successful operation or
  2. NULL if implementation is not available.

    Supported framework: TensorFlow Lite