Functions | |
int16_t * | arm_nn_mat_mult_kernel_s16 (const int8_t *input_a, const int16_t *input_b, const int32_t output_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, const int32_t num_col_a, const cmsis_nn_bias_data *const bias_data, int16_t *out_0) |
Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution. | |
arm_cmsis_nn_status | arm_nn_depthwise_conv_nt_t_padded_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, int8_t *out) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where the padding is -lhs_offset(Range: int8). Dimensions are the same for lhs and rhs. | |
int16_t * | arm_nn_depthwise_conv_nt_t_s16 (const int16_t *lhs, const int8_t *rhs, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int64_t *const output_bias, int16_t *out) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs. | |
arm_cmsis_nn_status | arm_nn_depthwise_conv_nt_t_s4 (const int8_t *lhs, const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, int8_t *out) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. rhs consists of packed int4 data. Dimensions are the same for lhs and rhs. | |
arm_cmsis_nn_status | arm_nn_depthwise_conv_nt_t_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t input_offset, const int32_t active_ch, const int32_t total_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, int8_t *out) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs. | |
arm_cmsis_nn_status | arm_nn_mat_mul_core_1x_s4 (int32_t row_elements, const int32_t skipped_row_elements, const int8_t *row_base_ref, const int8_t *col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output) |
General Vector by Matrix multiplication with requantization, storage of result and int4 weights packed into an int8 buffer. | |
arm_cmsis_nn_status | arm_nn_mat_mul_core_1x_s8 (int32_t row_elements, const int32_t skipped_row_elements, const int8_t *row_base_ref, const int8_t *col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output) |
General Vector by Matrix multiplication with requantization and storage of result. | |
int8_t * | arm_nn_mat_mul_core_4x_s8 (const int32_t row_elements, const int32_t offset, const int8_t *row_base, const int8_t *col_base_ref, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output) |
Matrix-multiplication with requantization & activation function for four rows and one column. | |
arm_cmsis_nn_status | arm_nn_mat_mult_nt_t_s16 (const int16_t *lhs, const int8_t *rhs, const cmsis_nn_bias_data *bias_data, int16_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t activation_min, const int32_t activation_max) |
General Matrix-multiplication function with per-channel requantization and int16 input (LHS) and output. This function assumes: | |
arm_cmsis_nn_status | arm_nn_mat_mult_nt_t_s4 (const int8_t *lhs, const int8_t *packed_rhs, const int32_t *bias, int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t lhs_cols_offset) |
General Matrix-multiplication function with per-channel requantization. This function assumes: | |
arm_cmsis_nn_status | arm_nn_mat_mult_nt_t_s8 (const int8_t *lhs, const int8_t *rhs, const int32_t *bias, int8_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max, const int32_t row_address_offset, const int32_t lhs_cols_offset) |
General Matrix-multiplication function with per-channel requantization. This function assumes: | |
arm_cmsis_nn_status | arm_nn_mat_mult_nt_t_s8_s32 (const int8_t *lhs, const int8_t *rhs, int32_t *dst, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_idx_offset) |
General Matrix-multiplication function with int8 input and int32 output. This function assumes: | |
Support functions for Convolution and DW Convolution
arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_padded_s8 | ( | const int8_t * | lhs, |
const int8_t * | rhs, | ||
const int32_t | lhs_offset, | ||
const int32_t | active_ch, | ||
const int32_t | total_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | out_offset, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const uint16_t | row_x_col, | ||
const int32_t *const | output_bias, | ||
int8_t * | out | ||
) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where the padding is -lhs_offset(Range: int8). Dimensions are the same for lhs and rhs.
[in] | lhs | Input left-hand side matrix |
[in] | rhs | Input right-hand side matrix (transposed) |
[in] | lhs_offset | LHS matrix offset(input offset). Range: -127 to 128 |
[in] | active_ch | Subset of total_ch processed |
[in] | total_ch | Number of channels in LHS/RHS |
[in] | out_shift | Per channel output shift. Length of vector is equal to number of channels |
[in] | out_mult | Per channel output multiplier. Length of vector is equal to number of channels |
[in] | out_offset | Offset to be added to the output values. Range: -127 to 128 |
[in] | activation_min | Minimum value to clamp the output to. Range: int8 |
[in] | activation_max | Maximum value to clamp the output to. Range: int8 |
[in] | row_x_col | (row_dimension * col_dimension) of LHS/RHS matrix |
[in] | output_bias | Per channel output bias. Length of vector is equal to number of channels |
[out] | out | Output pointer |
int16_t * arm_nn_depthwise_conv_nt_t_s16 | ( | const int16_t * | lhs, |
const int8_t * | rhs, | ||
const uint16_t | num_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const uint16_t | row_x_col, | ||
const int64_t *const | output_bias, | ||
int16_t * | out | ||
) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs.
[in] | lhs | Input left-hand side matrix |
[in] | rhs | Input right-hand side matrix (transposed) |
[in] | num_ch | Number of channels in LHS/RHS |
[in] | out_shift | Per channel output shift. Length of vector is equal to number of channels. |
[in] | out_mult | Per channel output multiplier. Length of vector is equal to number of channels. |
[in] | activation_min | Minimum value to clamp the output to. Range: int16 |
[in] | activation_max | Maximum value to clamp the output to. Range: int16 |
[in] | row_x_col | (row_dimension * col_dimension) of LHS/RHS matrix |
[in] | output_bias | Per channel output bias. Length of vector is equal to number of channels. |
[out] | out | Output pointer |
arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s4 | ( | const int8_t * | lhs, |
const int8_t * | rhs, | ||
const int32_t | lhs_offset, | ||
const int32_t | active_ch, | ||
const int32_t | total_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | out_offset, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const uint16_t | row_x_col, | ||
const int32_t *const | output_bias, | ||
int8_t * | out | ||
) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. rhs consists of packed int4 data. Dimensions are the same for lhs and rhs.
[in] | lhs | Input left-hand side matrix |
[in] | rhs | Input right-hand side matrix (transposed). Consists of int4 data packed in an int8 buffer. |
[in] | lhs_offset | LHS matrix offset(input offset). Range: -127 to 128 |
[in] | active_ch | Subset of total_ch processed |
[in] | total_ch | Number of channels in LHS/RHS |
[in] | out_shift | Per channel output shift. Length of vector is equal to number of channels. |
[in] | out_mult | Per channel output multiplier. Length of vector is equal to number of channels. |
[in] | out_offset | Offset to be added to the output values. Range: -127 to 128 |
[in] | activation_min | Minimum value to clamp the output to. Range: int8 |
[in] | activation_max | Maximum value to clamp the output to. Range: int8 |
[in] | row_x_col | (row_dimension * col_dimension) of LHS/RHS matrix |
[in] | output_bias | Per channel output bias. Length of vector is equal to number of channels. |
[out] | out | Output pointer |
arm_cmsis_nn_status arm_nn_depthwise_conv_nt_t_s8 | ( | const int8_t * | lhs, |
const int8_t * | rhs, | ||
const int32_t | lhs_offset, | ||
const int32_t | active_ch, | ||
const int32_t | total_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | out_offset, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const uint16_t | row_x_col, | ||
const int32_t *const | output_bias, | ||
int8_t * | out | ||
) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs.
[in] | lhs | Input left-hand side matrix |
[in] | rhs | Input right-hand side matrix (transposed) |
[in] | lhs_offset | LHS matrix offset(input offset). Range: -127 to 128 |
[in] | active_ch | Subset of total_ch processed |
[in] | total_ch | Number of channels in LHS/RHS |
[in] | out_shift | Per channel output shift. Length of vector is equal to number of channels. |
[in] | out_mult | Per channel output multiplier. Length of vector is equal to number of channels. |
[in] | out_offset | Offset to be added to the output values. Range: -127 to 128 |
[in] | activation_min | Minimum value to clamp the output to. Range: int8 |
[in] | activation_max | Maximum value to clamp the output to. Range: int8 |
[in] | row_x_col | (row_dimension * col_dimension) of LHS/RHS matrix |
[in] | output_bias | Per channel output bias. Length of vector is equal to number of channels. |
[out] | out | Output pointer |
arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s4 | ( | int32_t | row_elements, |
const int32_t | skipped_row_elements, | ||
const int8_t * | row_base_ref, | ||
const int8_t * | col_base_ref, | ||
const int32_t | out_ch, | ||
const cmsis_nn_conv_params * | conv_params, | ||
const cmsis_nn_per_channel_quant_params * | quant_params, | ||
const int32_t * | bias, | ||
int8_t * | output | ||
) |
General Vector by Matrix multiplication with requantization, storage of result and int4 weights packed into an int8 buffer.
[in] | row_elements | number of row elements |
[in] | skipped_row_elements | number of row elements skipped due to padding. row_elements + skipped_row_elements = (kernel_x * kernel_y) * input_ch |
[in] | row_base_ref | pointer to row operand |
[in] | col_base_ref | pointer to col operand as packed int4 |
[in] | out_ch | Number of output channels |
[in] | conv_params | Pointer to convolution parameters like offsets and activation values |
[in] | quant_params | Pointer to per-channel quantization parameters |
[in] | bias | Pointer to optional per-channel bias |
[out] | output | Pointer to output where int8 results are stored. |
Pseudo-code as int8 example. Int4 filter data will be unpacked.
    *output = 0
    sum_col = 0
    for (j = 0; j < out_ch; j++)
        for (i = 0; i < row_elements; i++)
            *output += row_base_ref[i] * col_base_ref[i]
            sum_col += col_base_ref[i]
        scale sum_col using quant_params and bias
        store result in 'output'
arm_cmsis_nn_status arm_nn_mat_mul_core_1x_s8 | ( | int32_t | row_elements, |
const int32_t | skipped_row_elements, | ||
const int8_t * | row_base_ref, | ||
const int8_t * | col_base_ref, | ||
const int32_t | out_ch, | ||
const cmsis_nn_conv_params * | conv_params, | ||
const cmsis_nn_per_channel_quant_params * | quant_params, | ||
const int32_t * | bias, | ||
int8_t * | output | ||
) |
General Vector by Matrix multiplication with requantization and storage of result.
[in] | row_elements | number of row elements |
[in] | skipped_row_elements | number of row elements skipped due to padding. row_elements + skipped_row_elements = (kernel_x * kernel_y) * input_ch |
[in] | row_base_ref | pointer to row operand |
[in] | col_base_ref | pointer to col operand |
[in] | out_ch | Number of output channels |
[in] | conv_params | Pointer to convolution parameters like offsets and activation values |
[in] | quant_params | Pointer to per-channel quantization parameters |
[in] | bias | Pointer to optional per-channel bias |
[out] | output | Pointer to output where int8 results are stored. |
Pseudo-code
    *output = 0
    sum_col = 0
    for (j = 0; j < out_ch; j++)
        for (i = 0; i < row_elements; i++)
            *output += row_base_ref[i] * col_base_ref[i]
            sum_col += col_base_ref[i]
        scale sum_col using quant_params and bias
        store result in 'output'
int8_t * arm_nn_mat_mul_core_4x_s8 | ( | const int32_t | row_elements, |
const int32_t | offset, | ||
const int8_t * | row_base, | ||
const int8_t * | col_base, | ||
const int32_t | out_ch, | ||
const cmsis_nn_conv_params * | conv_params, | ||
const cmsis_nn_per_channel_quant_params * | quant_params, | ||
const int32_t * | bias, | ||
int8_t * | output | ||
) |
Matrix-multiplication with requantization & activation function for four rows and one column.
[in] | row_elements | number of row elements |
[in] | offset | Offset between rows. Can be the same as row_elements, for example in a 1x1 convolution with stride 1. |
[in] | row_base | pointer to row operand |
[in] | col_base | pointer to col operand |
[in] | out_ch | Number of output channels |
[in] | conv_params | Pointer to convolution parameters like offsets and activation values |
[in] | quant_params | Pointer to per-channel quantization parameters |
[in] | bias | Pointer to per-channel bias |
[out] | output | Pointer to output where int8 results are stored. |
Compliant with the TFLM int8 specification. MVE implementation only.
int16_t * arm_nn_mat_mult_kernel_s16 | ( | const int8_t * | input_a, |
const int16_t * | input_b, | ||
const int32_t | output_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const int32_t | num_col_a, | ||
const cmsis_nn_bias_data *const | bias_data, | ||
int16_t * | out_0 | ||
) |
Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution.
[in] | input_a | pointer to operand A |
[in] | input_b | pointer to operand B, always consists of 2 vectors. |
[in] | output_ch | number of rows of A |
[in] | out_shift | pointer to per output channel requantization shift parameter. |
[in] | out_mult | pointer to per output channel requantization multiplier parameter. |
[in] | activation_min | minimum value to clamp the output to. Range : int16 |
[in] | activation_max | maximum value to clamp the output to. Range : int16 |
[in] | num_col_a | number of columns of A |
[in] | bias_data | pointer to struct with bias vector. The length of this vector is equal to the number of output columns (or RHS input rows). The vector can be int32 or int64 indicated by a flag in the struct. |
[in,out] | out_0 | pointer to output |
Returns NULL if implementation is not available.
This function does the matrix multiplication of weight matrix for all output channels with 2 columns from im2col and produces two elements/output_channel. The outputs are clamped in the range provided by activation min and max. Supported framework: TensorFlow Lite micro.
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s16 | ( | const int16_t * | lhs, |
const int8_t * | rhs, | ||
const cmsis_nn_bias_data * | bias_data, | ||
int16_t * | dst, | ||
const int32_t * | dst_multipliers, | ||
const int32_t * | dst_shifts, | ||
const int32_t | lhs_rows, | ||
const int32_t | rhs_rows, | ||
const int32_t | rhs_cols, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max | ||
) |
General Matrix-multiplication function with per-channel requantization and int16 input (LHS) and output. This function assumes that the LHS input matrix is NOT transposed (nt) and that the RHS input matrix is transposed (t).
[in] | lhs | Pointer to the LHS input matrix |
[in] | rhs | Pointer to the RHS input matrix |
[in] | bias_data | Pointer to struct with bias vector. The length of this vector is equal to the number of output columns (or RHS input rows). The vector can be int32 or int64 indicated by a flag in the struct. |
[out] | dst | Pointer to the output matrix with "m" rows and "n" columns |
[in] | dst_multipliers | Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows) |
[in] | dst_shifts | Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows) |
[in] | lhs_rows | Number of LHS input rows |
[in] | rhs_rows | Number of RHS input rows |
[in] | rhs_cols | Number of LHS/RHS input columns |
[in] | activation_min | Minimum value to clamp down the output. Range : int16 |
[in] | activation_max | Maximum value to clamp up the output. Range : int16 |
MVE implementation only.
Returns ARM_CMSIS_NN_SUCCESS, or ARM_CMSIS_NN_NO_IMPL_ERROR if not built for MVE.
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s4 | ( | const int8_t * | lhs, |
const int8_t * | rhs, | ||
const int32_t * | bias, | ||
int8_t * | dst, | ||
const int32_t * | dst_multipliers, | ||
const int32_t * | dst_shifts, | ||
const int32_t | lhs_rows, | ||
const int32_t | rhs_rows, | ||
const int32_t | rhs_cols, | ||
const int32_t | lhs_offset, | ||
const int32_t | dst_offset, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const int32_t | lhs_cols_offset | ||
) |
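General Matrix-multiplication function with per-channel requantization. This function assumes that the LHS input matrix is NOT transposed (nt), that the RHS input matrix is transposed (t), and that the RHS holds int4 values packed into an int8 buffer.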
[in] | lhs | Pointer to the LHS input matrix |
[in] | rhs | Pointer to the RHS input matrix |
[in] | bias | Pointer to the bias vector. The length of this vector is equal to the number of output columns (or RHS input rows) |
[out] | dst | Pointer to the output matrix with "m" rows and "n" columns |
[in] | dst_multipliers | Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows) |
[in] | dst_shifts | Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows) |
[in] | lhs_rows | Number of LHS input rows |
[in] | rhs_rows | Number of RHS input rows |
[in] | rhs_cols | Number of LHS/RHS input columns |
[in] | lhs_offset | Offset to be applied to the LHS input value |
[in] | dst_offset | Offset to be applied to the output result |
[in] | activation_min | Minimum value to clamp down the output. Range : int8 |
[in] | activation_max | Maximum value to clamp up the output. Range : int8 |
[in] | lhs_cols_offset | Column offset between subsequent lhs_rows |
Returns ARM_CMSIS_NN_SUCCESS.
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8 | ( | const int8_t * | lhs, |
const int8_t * | rhs, | ||
const int32_t * | bias, | ||
int8_t * | dst, | ||
const int32_t * | dst_multipliers, | ||
const int32_t * | dst_shifts, | ||
const int32_t | lhs_rows, | ||
const int32_t | rhs_rows, | ||
const int32_t | rhs_cols, | ||
const int32_t | lhs_offset, | ||
const int32_t | dst_offset, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const int32_t | row_address_offset, | ||
const int32_t | lhs_cols_offset | ||
) |
General Matrix-multiplication function with per-channel requantization. This function assumes that the LHS input matrix is NOT transposed (nt) and that the RHS input matrix is transposed (t).
[in] | lhs | Pointer to the LHS input matrix |
[in] | rhs | Pointer to the RHS input matrix |
[in] | bias | Pointer to the bias vector. The length of this vector is equal to the number of output columns (or RHS input rows) |
[out] | dst | Pointer to the output matrix with "m" rows and "n" columns |
[in] | dst_multipliers | Pointer to the multipliers vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows) |
[in] | dst_shifts | Pointer to the shifts vector needed for the per-channel requantization. The length of this vector is equal to the number of output columns (or RHS input rows) |
[in] | lhs_rows | Number of LHS input rows |
[in] | rhs_rows | Number of RHS input rows |
[in] | rhs_cols | Number of LHS/RHS input columns |
[in] | lhs_offset | Offset to be applied to the LHS input value |
[in] | dst_offset | Offset to be applied to the output result |
[in] | activation_min | Minimum value to clamp down the output. Range : int8 |
[in] | activation_max | Maximum value to clamp up the output. Range : int8 |
[in] | row_address_offset | Address offset between rows in output. NOTE: Only used for MVEI extension. |
[in] | lhs_cols_offset | Column offset between subsequent lhs_rows |
Returns ARM_CMSIS_NN_SUCCESS.
arm_cmsis_nn_status arm_nn_mat_mult_nt_t_s8_s32 | ( | const int8_t * | lhs, |
const int8_t * | rhs, | ||
int32_t * | dst, | ||
const int32_t | lhs_rows, | ||
const int32_t | rhs_rows, | ||
const int32_t | rhs_cols, | ||
const int32_t | lhs_offset, | ||
const int32_t | dst_idx_offset | ||
) |
General Matrix-multiplication function with int8 input and int32 output. This function assumes that the LHS input matrix is NOT transposed (nt) and that the RHS input matrix is transposed (t).
[in] | lhs | Pointer to the LHS input matrix |
[in] | rhs | Pointer to the RHS input matrix |
[out] | dst | Pointer to the output matrix with "m" rows and "n" columns |
[in] | lhs_rows | Number of LHS input rows |
[in] | rhs_rows | Number of LHS input columns/RHS input rows |
[in] | rhs_cols | Number of RHS input columns |
[in] | lhs_offset | Offset to be applied to the LHS input value |
[in] | dst_idx_offset | Offset between subsequent output results |
Returns ARM_CMSIS_NN_SUCCESS.