CMSIS-NN
CMSIS NN Software Library
|
Functions | |
arm_cmsis_nn_status | arm_nn_depthwise_conv_nt_t_padded_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, int8_t *out) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where the padding is -lhs_offset (Range: int8). Dimensions are the same for lhs and rhs. More... | |
int16_t * | arm_nn_depthwise_conv_nt_t_s16 (const int16_t *lhs, const int8_t *rhs, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int64_t *const output_bias, int16_t *out) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs. More... | |
arm_cmsis_nn_status | arm_nn_depthwise_conv_nt_t_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, int8_t *out) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs. More... | |
arm_cmsis_nn_status | arm_nn_mat_mul_core_1x_s8 (int32_t row_elements, const int32_t skipped_row_elements, const int8_t *row_base_ref, const int8_t *col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output) |
General Vector by Matrix multiplication with requantization and storage of result. More... | |
int8_t * | arm_nn_mat_mul_core_4x_s8 (const int32_t row_elements, const int32_t offset, const int8_t *row_base, const int8_t *col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output) |
Matrix-multiplication with requantization & activation function for four rows and one column. More... | |
int16_t * | arm_nn_mat_mult_kernel_s16 (const int8_t *input_a, const int16_t *input_b, const int32_t output_ch, const int32_t *out_shift, const int32_t *out_mult, const int16_t activation_min, const int16_t activation_max, const int32_t num_col_a, const int64_t *const output_bias, int16_t *out_0) |
Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution. More... | |
arm_cmsis_nn_status | arm_nn_mat_mult_nt_t_s4 (const int8_t *lhs, const int8_t *packed_rhs, const int32_t *bias, int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t lhs_cols_offset) |
General Matrix-multiplication function with per-channel requantization. This function assumes: More... | |
arm_cmsis_nn_status | arm_nn_mat_mult_nt_t_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t *bias, int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t lhs_cols_offset) |
General Matrix-multiplication function with per-channel requantization. This function assumes: More... | |
arm_cmsis_nn_status | arm_nn_mat_mult_nt_t_s8_s32 (const int8_t *lhs, const int8_t *rhs, int32_t *dst, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_idx_offset) |
General Matrix-multiplication function with int8 input and int32 output. This function assumes: More... | |
Support functions for Convolution and DW Convolution
arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8 | ( | const int8_t * | lhs, |
const int8_t * | rhs, | ||
const int32_t | lhs_offset, | ||
const int32_t | active_ch, | ||
const int32_t | total_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | out_offset, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const uint16_t | row_x_col, | ||
const int32_t *const | output_bias, | ||
int8_t * | out | ||
) |
[in] | lhs | Input left-hand side matrix |
[in] | rhs | Input right-hand side matrix (transposed) |
[in] | lhs_offset | LHS matrix offset (input offset). Range: -127 to 128 |
[in] | active_ch | Subset of total_ch processed |
[in] | total_ch | Number of channels in LHS/RHS |
[in] | out_shift | Per channel output shift. Length of vector is equal to number of channels |
[in] | out_mult | Per channel output multiplier. Length of vector is equal to number of channels |
[in] | out_offset | Offset to be added to the output values. Range: -127 to 128 |
[in] | activation_min | Minimum value to clamp the output to. Range: int8 |
[in] | activation_max | Maximum value to clamp the output to. Range: int8 |
[in] | row_x_col | (row_dimension * col_dimension) of LHS/RHS matrix |
[in] | output_bias | Per channel output bias. Length of vector is equal to number of channels |
[out] | out | Output pointer |
int16_t * arm_nn_depthwise_conv_nt_t_s16 | ( | const int16_t * | lhs, |
const int8_t * | rhs, | ||
const uint16_t | num_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const uint16_t | row_x_col, | ||
const int64_t *const | output_bias, | ||
int16_t * | out | ||
) |
[in] | lhs | Input left-hand side matrix |
[in] | rhs | Input right-hand side matrix (transposed) |
[in] | num_ch | Number of channels in LHS/RHS |
[in] | out_shift | Per channel output shift. Length of vector is equal to number of channels. |
[in] | out_mult | Per channel output multiplier. Length of vector is equal to number of channels. |
[in] | activation_min | Minimum value to clamp the output to. Range: int16 |
[in] | activation_max | Maximum value to clamp the output to. Range: int16 |
[in] | row_x_col | (row_dimension * col_dimension) of LHS/RHS matrix |
[in] | output_bias | Per channel output bias. Length of vector is equal to number of channels. |
[out] | out | Output pointer |
arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8 | ( | const int8_t * | lhs, |
const int8_t * | rhs, | ||
const int32_t | lhs_offset, | ||
const int32_t | active_ch, | ||
const int32_t | total_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | out_offset, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const uint16_t | row_x_col, | ||
const int32_t *const | output_bias, | ||
int8_t * | out | ||
) |
[in] | lhs | Input left-hand side matrix |
[in] | rhs | Input right-hand side matrix (transposed) |
[in] | lhs_offset | LHS matrix offset (input offset). Range: -127 to 128 |
[in] | active_ch | Subset of total_ch processed |
[in] | total_ch | Number of channels in LHS/RHS |
[in] | out_shift | Per channel output shift. Length of vector is equal to number of channels. |
[in] | out_mult | Per channel output multiplier. Length of vector is equal to number of channels. |
[in] | out_offset | Offset to be added to the output values. Range: -127 to 128 |
[in] | activation_min | Minimum value to clamp the output to. Range: int8 |
[in] | activation_max | Maximum value to clamp the output to. Range: int8 |
[in] | row_x_col | (row_dimension * col_dimension) of LHS/RHS matrix |
[in] | output_bias | Per channel output bias. Length of vector is equal to number of channels. |
[out] | out | Output pointer |
arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8 | ( | int32_t | row_elements, |
const int32_t | skipped_row_elements, | ||
const int8_t * | row_base_ref, | ||
const int8_t * | col_base_ref, | ||
const int32_t | out_ch, | ||
const cmsis_nn_conv_params * | conv_params, | ||
const cmsis_nn_per_channel_quant_params * | quant_params, | ||
const int32_t * | bias, | ||
int8_t * | output | ||
) |
[in] | row_elements | number of row elements |
[in] | skipped_row_elements | number of row elements skipped due to padding. row_elements + skipped_row_elements = (kernel_x * kernel_y) * input_ch |
[in] | row_base_ref | pointer to row operand |
[in] | col_base_ref | pointer to col operand |
[in] | out_ch | Number of output channels |
[in] | conv_params | Pointer to convolution parameters like offsets and activation values |
[in] | quant_params | Pointer to per-channel quantization parameters |
[in] | bias | Pointer to optional per-channel bias |
[out] | output | Pointer to output where int8 results are stored. |
Pseudo-code:
    *output = 0
    sum_col = 0
    for (j = 0; j < out_ch; j++)
        for (i = 0; i < row_elements; i++)
            *output += row_base_ref[i] * col_base_ref[i]
            sum_col += col_base_ref[i]
    scale sum_col using quant_params and bias
    store result in 'output'
int8_t * arm_nn_mat_mul_core_4x_s8 | ( | const int32_t | row_elements, |
const int32_t | offset, | ||
const int8_t * | row_base, | ||
const int8_t * | col_base, | ||
const int32_t | out_ch, | ||
const cmsis_nn_conv_params * | conv_params, | ||
const cmsis_nn_per_channel_quant_params * | quant_params, | ||
const int32_t * | bias, | ||
int8_t * | output | ||
) |
[in] | row_elements | number of row elements |
[in] | offset | offset between rows. Can be the same as row_elements, e.g. in a 1x1 convolution with a stride of 1. |
[in] | row_base | pointer to row operand |
[in] | col_base | pointer to col operand |
[in] | out_ch | Number of output channels |
[in] | conv_params | Pointer to convolution parameters like offsets and activation values |
[in] | quant_params | Pointer to per-channel quantization parameters |
[in] | bias | Pointer to per-channel bias |
[out] | output | Pointer to output where int8 results are stored. |
Compliant with the TFLM int8 specification. MVE implementation only.
int16_t * arm_nn_mat_mult_kernel_s16 | ( | const int8_t * | input_a, |
const int16_t * | input_b, | ||
const int32_t | output_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int16_t | activation_min, | ||
const int16_t | activation_max, | ||
const int32_t | num_col_a, | ||
const int64_t *const | output_bias, | ||
int16_t * | out_0 | ||
) |
[in] | input_a | pointer to operand A |
[in] | input_b | pointer to operand B, always consists of 2 vectors. |
[in] | output_ch | number of rows of A |
[in] | out_shift | pointer to per output channel requantization shift parameter. |
[in] | out_mult | pointer to per output channel requantization multiplier parameter. |
[in] | activation_min | minimum value to clamp the output to. Range : int16 |
[in] | activation_max | maximum value to clamp the output to. Range : int16 |
[in] | num_col_a | number of columns of A |
[in] | output_bias | per output channel bias. Range : int64 |
[in,out] | out_0 | pointer to output |
Returns: NULL if implementation is not available.
This function does the matrix multiplication of weight matrix for all output channels with 2 columns from im2col and produces two elements/output_channel. The outputs are clamped in the range provided by activation min and max. Supported framework: TensorFlow Lite Micro.
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s4 | ( | const int8_t * | lhs, |
const int8_t * | rhs, | ||
const int32_t * | bias, | ||
int8_t * | dst, | ||
const int32_t * | dst_multipliers, | ||
const int32_t * | dst_shifts, | ||
const int32_t | lhs_rows, | ||
const int32_t | rhs_rows, | ||
const int32_t | rhs_cols, | ||
const int32_t | lhs_offset, | ||
const int32_t | dst_offset, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const int32_t | lhs_cols_offset | ||
) |
[in] | lhs | Pointer to the LHS input matrix |
[in] | rhs | Pointer to the RHS input matrix |
[in] | bias | Pointer to the bias vector. The length of this vector is equal to the number of output columns (or RHS input rows) |
[out] | dst | Pointer to the output matrix with "m" rows and "n" columns |
[in] | dst_multipliers | Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows) |
[in] | dst_shifts | Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows) |
[in] | lhs_rows | Number of LHS input rows |
[in] | rhs_rows | Number of RHS input rows |
[in] | rhs_cols | Number of LHS/RHS input columns |
[in] | lhs_offset | Offset to be applied to the LHS input value |
[in] | dst_offset | Offset to be applied to the output result |
[in] | activation_min | Minimum value to clamp down the output. Range : int8 |
[in] | activation_max | Maximum value to clamp up the output. Range : int8 |
[in] | lhs_cols_offset | Column offset between subsequent lhs_rows |
Returns: ARM_CMSIS_NN_SUCCESS
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8 | ( | const int8_t * | lhs, |
const int8_t * | rhs, | ||
const int32_t * | bias, | ||
int8_t * | dst, | ||
const int32_t * | dst_multipliers, | ||
const int32_t * | dst_shifts, | ||
const int32_t | lhs_rows, | ||
const int32_t | rhs_rows, | ||
const int32_t | rhs_cols, | ||
const int32_t | lhs_offset, | ||
const int32_t | dst_offset, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const int32_t | lhs_cols_offset | ||
) |
[in] | lhs | Pointer to the LHS input matrix |
[in] | rhs | Pointer to the RHS input matrix |
[in] | bias | Pointer to the bias vector. The length of this vector is equal to the number of output columns (or RHS input rows) |
[out] | dst | Pointer to the output matrix with "m" rows and "n" columns |
[in] | dst_multipliers | Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows) |
[in] | dst_shifts | Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows) |
[in] | lhs_rows | Number of LHS input rows |
[in] | rhs_rows | Number of RHS input rows |
[in] | rhs_cols | Number of LHS/RHS input columns |
[in] | lhs_offset | Offset to be applied to the LHS input value |
[in] | dst_offset | Offset to be applied to the output result |
[in] | activation_min | Minimum value to clamp down the output. Range : int8 |
[in] | activation_max | Maximum value to clamp up the output. Range : int8 |
[in] | lhs_cols_offset | Column offset between subsequent lhs_rows |
Returns: ARM_CMSIS_NN_SUCCESS
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8_s32 | ( | const int8_t * | lhs, |
const int8_t * | rhs, | ||
int32_t * | dst, | ||
const int32_t | lhs_rows, | ||
const int32_t | rhs_rows, | ||
const int32_t | rhs_cols, | ||
const int32_t | lhs_offset, | ||
const int32_t | dst_idx_offset | ||
) |
[in] | lhs | Pointer to the LHS input matrix |
[in] | rhs | Pointer to the RHS input matrix |
[out] | dst | Pointer to the output matrix with "m" rows and "n" columns |
[in] | lhs_rows | Number of LHS input rows |
[in] | rhs_rows | Number of LHS input columns/RHS input rows |
[in] | rhs_cols | Number of RHS input columns |
[in] | lhs_offset | Offset to be applied to the LHS input value |
[in] | dst_idx_offset | Offset between subsequent output results |
Returns: ARM_CMSIS_NN_SUCCESS