CMSIS-NN
Version 3.1.0
CMSIS NN Software Library
|
Data Structures | |
union | arm_nnword |
Union for SIMD access of q31/q15/q7 types. More... | |
struct | arm_nn_double |
Struct holding the low and high words of data type long long (used as the word member of union arm_nn_long_long). More... | |
union | arm_nn_long_long |
Macros | |
#define | LEFT_SHIFT(_shift) |
#define | RIGHT_SHIFT(_shift) |
#define | MASK_IF_ZERO(x) |
#define | MASK_IF_NON_ZERO(x) |
#define | SELECT_USING_MASK(mask, a, b) |
#define | MAX(A, B) |
#define | MIN(A, B) |
#define | CLAMP(x, h, l) |
#define | REDUCE_MULTIPLIER(_mult) |
#define | PACK_Q7x4_32x1(v0, v1, v2, v3) |
definition to pack four 8 bit values. More... | |
#define | NN_ROUND(out_shift) |
macro for adding rounding offset More... | |
#define | MUL_SAT(a, b) |
#define | MUL_SAT_MVE(a, b) |
#define | MUL_POW2(a, b) |
#define | DIV_POW2(a, b) |
#define | DIV_POW2_MVE(a, b) |
#define | EXP_ON_NEG(x) |
#define | ONE_OVER1(x) |
#define | SELECT_IF_NON_ZERO(x) |
Functions | |
void | arm_q7_to_q15_no_shift (const q7_t *pSrc, q15_t *pDst, uint32_t blockSize) |
Converts the elements of the q7 vector to q15 vector without left-shift. More... | |
void | arm_nn_add_q7 (const q7_t *input, q31_t *output, uint32_t block_size) |
Non-saturating addition of elements of a q7 vector. More... | |
void | arm_q7_to_q15_reordered_no_shift (const q7_t *pSrc, q15_t *pDst, uint32_t blockSize) |
Converts the elements of the q7 vector to reordered q15 vector without left-shift. More... | |
void | arm_q7_to_q15_with_offset (const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset) |
Converts the elements from a q7 vector to a q15 vector with an added offset. More... | |
void | arm_q7_to_q15_reordered_with_offset (const q7_t *src, q15_t *dst, uint32_t block_size, q15_t offset) |
Converts the elements of the q7 vector to reordered q15 vector with an added offset. More... | |
void | arm_nn_accumulate_q7_to_q15 (q15_t *dst, const q7_t *src, uint32_t block_size) |
Converts the elements of a q7 vector to q15 and accumulates them into a q15 vector. More... | |
q7_t * | arm_nn_depthwise_conv_s8_core (const q7_t *row, const q15_t *col, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t kernel_size, const int32_t *const output_bias, q7_t *out) |
Depthwise conv on an im2col buffer where the input channel equals output channel. More... | |
q7_t * | arm_nn_mat_mult_s8 (const q7_t *input_row, const q7_t *input_col, const uint16_t output_ch, const uint16_t col_batches, const int32_t *output_shift, const int32_t *output_mult, const int32_t out_offset, const int32_t col_offset, const int32_t row_offset, const int16_t out_activation_min, const int16_t out_activation_max, const uint16_t row_len, const int32_t *const bias, q7_t *out) |
General Matrix-multiplication function with per-channel requantization. More... | |
q15_t * | arm_nn_mat_mult_kernel_s16 (const q7_t *input_a, const q15_t *input_b, const int32_t output_ch, const int32_t *out_shift, const int32_t *out_mult, const int16_t activation_min, const int16_t activation_max, const int32_t num_col_a, const int64_t *const output_bias, q15_t *out_0) |
Matrix-multiplication function for convolution with per-channel requantization for 16 bits convolution. More... | |
arm_status | arm_nn_mat_mul_core_1x_s8 (int32_t row_elements, const int8_t *row_base, const int8_t *col_base, int32_t *const sum_col, int32_t *const output) |
General Matrix-multiplication without requantization for one row & one column. More... | |
int8_t * | arm_nn_mat_mul_core_4x_s8 (const int32_t row_elements, const int32_t offset, const int8_t *row_base, const int8_t *col_base, const int32_t out_ch, const cmsis_nn_conv_params *conv_params, const cmsis_nn_per_channel_quant_params *quant_params, const int32_t *bias, int8_t *output) |
Matrix-multiplication with requantization & activation function for four rows and one column. More... | |
arm_status | arm_nn_mat_mult_nt_t_s8 (const q7_t *lhs, const q7_t *rhs, const q31_t *bias, q7_t *dst, const int32_t *dst_multipliers, const int32_t *dst_shifts, const int32_t lhs_rows, const int32_t rhs_rows, const int32_t rhs_cols, const int32_t lhs_offset, const int32_t dst_offset, const int32_t activation_min, const int32_t activation_max) |
General Matrix-multiplication function with per-channel requantization. This function assumes: More... | |
arm_status | arm_nn_vec_mat_mult_t_s8 (const q7_t *lhs, const q7_t *rhs, const q31_t *bias, q7_t *dst, const int32_t lhs_offset, const int32_t rhs_offset, const int32_t dst_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max, const int32_t address_offset) |
s8 Vector by Matrix (transposed) multiplication More... | |
arm_status | arm_nn_vec_mat_mult_t_s16 (const q15_t *lhs, const q7_t *rhs, const q63_t *bias, q15_t *dst, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max) |
s16 Vector by Matrix (transposed) multiplication More... | |
arm_status | arm_nn_vec_mat_mult_t_svdf_s8 (const q7_t *lhs, const q7_t *rhs, q15_t *dst, const int32_t lhs_offset, const int32_t rhs_offset, const int32_t scatter_offset, const int32_t dst_multiplier, const int32_t dst_shift, const int32_t rhs_cols, const int32_t rhs_rows, const int32_t activation_min, const int32_t activation_max) |
s8 Vector by Matrix (transposed) multiplication with s16 output More... | |
q7_t * | arm_nn_depthwise_conv_nt_t_padded_s8 (const q7_t *lhs, const q7_t *rhs, const int32_t lhs_offset, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, q7_t *out) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in padded cases where the padding is -lhs_offset (Range: int8). Dimensions are the same for lhs and rhs. More... | |
q7_t * | arm_nn_depthwise_conv_nt_t_s8 (const q7_t *lhs, const q7_t *rhs, const int32_t lhs_offset, const uint16_t num_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int32_t activation_min, const int32_t activation_max, const uint16_t row_x_col, const int32_t *const output_bias, q7_t *out) |
Depthwise convolution of transposed rhs matrix with 4 lhs matrices. To be used in non-padded cases. Dimensions are the same for lhs and rhs. More... | |
q7_t * | arm_nn_mat_mult_kernel_q7_q15_reordered (const q7_t *pA, const q15_t *pInBuffer, const uint16_t ch_im_out, const uint16_t numCol_A, const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q7_t *pOut) |
Matrix-multiplication function for convolution with reordered columns. More... | |
__STATIC_FORCEINLINE q31_t | arm_nn_read_q15x2_ia (const q15_t **in_q15) |
Read 2 q15 elements and post increment pointer. More... | |
__STATIC_FORCEINLINE q31_t | arm_nn_read_q7x4_ia (const q7_t **in_q7) |
Read 4 q7 from q7 pointer and post increment pointer. More... | |
__STATIC_FORCEINLINE q31_t | arm_nn_read_q15x2 (const q15_t *in_q15) |
Read 2 q15 from q15 pointer. More... | |
__STATIC_FORCEINLINE q31_t | arm_nn_read_q7x4 (const q7_t *in_q7) |
Read 4 q7 values. More... | |
__STATIC_FORCEINLINE void | arm_nn_write_q7x4_ia (q7_t **in, q31_t value) |
Write four q7 to q7 pointer and increment pointer afterwards. More... | |
__STATIC_FORCEINLINE void | arm_memset_q7 (q7_t *dst, const q7_t val, uint32_t block_size) |
memset optimized for MVE More... | |
void | arm_nn_mult_q15 (q15_t *pSrcA, q15_t *pSrcB, q15_t *pDst, const uint16_t out_shift, uint32_t blockSize) |
q15 vector multiplication with variable output shifts More... | |
void | arm_nn_mult_q7 (q7_t *pSrcA, q7_t *pSrcB, q7_t *pDst, const uint16_t out_shift, uint32_t blockSize) |
q7 vector multiplication with variable output shifts More... | |
q7_t * | arm_nn_mat_mult_kernel_s8_s16 (const q7_t *input_a, const q15_t *input_b, const uint16_t output_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int16_t activation_min, const int16_t activation_max, const uint16_t num_col_a, const int32_t *const output_bias, q7_t *out_0) |
Matrix-multiplication function for convolution with per-channel requantization. More... | |
void | arm_nn_softmax_common_s8 (const int8_t *input, const int32_t num_rows, const int32_t row_size, const int32_t mult, const int32_t shift, const int32_t diff_min, const bool int16_output, void *output) |
Common softmax function for s8 input and s8 or s16 output. More... | |
__STATIC_FORCEINLINE q31_t | arm_nn_doubling_high_mult (const q31_t m1, const q31_t m2) |
Saturating doubling high multiply. Result matches NEON instruction VQRDMULH. More... | |
__STATIC_FORCEINLINE q31_t | arm_nn_doubling_high_mult_no_sat (const q31_t m1, const q31_t m2) |
Doubling high multiply without saturation. This is intended for requantization where the scale is a positive integer. More... | |
__STATIC_FORCEINLINE q31_t | arm_nn_divide_by_power_of_two (const q31_t dividend, const q31_t exponent) |
Rounding divide by power of two. More... | |
__STATIC_FORCEINLINE q31_t | arm_nn_requantize (const q31_t val, const q31_t multiplier, const q31_t shift) |
Requantize a given value. More... | |
__STATIC_FORCEINLINE q31_t | arm_nn_requantize_s64 (const q63_t val, const q31_t reduced_multiplier, const q31_t shift) |
Requantize a given 64 bit value. More... | |
__STATIC_FORCEINLINE void | arm_memcpy_q7 (q7_t *__RESTRICT dst, const q7_t *__RESTRICT src, uint32_t block_size) |
memcpy optimized for MVE More... | |
__STATIC_FORCEINLINE int32_t | arm_nn_exp_on_negative_values (int32_t val) |
__STATIC_FORCEINLINE q31_t | arm_nn_mult_by_power_of_two (const int32_t val, const int32_t exp) |
__STATIC_FORCEINLINE int32_t | arm_nn_one_over_one_plus_x_for_x_in_0_1 (int32_t val) |
__STATIC_FORCEINLINE void | arm_nn_write_q15x2_ia (q15_t **dest_q15, q31_t src_q31) |
Write 2 q15 elements and post increment pointer. More... | |
#define CLAMP | ( | x, | |
h, | |||
l | |||
) |
Referenced by arm_nn_softmax_common_s8(), arm_softmax_s8(), arm_softmax_u8(), arm_svdf_s8(), and arm_svdf_state_s16_s8().
#define DIV_POW2 | ( | a, | |
b | |||
) |
Referenced by arm_nn_exp_on_negative_values(), arm_nn_softmax_common_s8(), arm_softmax_s8(), and arm_softmax_u8().
#define DIV_POW2_MVE | ( | a, | |
b | |||
) |
Referenced by arm_softmax_s8().
#define EXP_ON_NEG | ( | x | ) |
Referenced by arm_nn_softmax_common_s8(), arm_softmax_s8(), and arm_softmax_u8().
#define LEFT_SHIFT | ( | _shift | ) |
Referenced by arm_nn_requantize().
#define MASK_IF_NON_ZERO | ( | x | ) |
Referenced by arm_nn_mult_by_power_of_two().
#define MASK_IF_ZERO | ( | x | ) |
Referenced by arm_nn_exp_on_negative_values().
#define MAX | ( | A, | |
B | |||
) |
Referenced by __attribute__(), arm_avgpool_s16(), arm_avgpool_s8(), arm_convolve_1_x_n_s8(), arm_convolve_1x1_s8_fast(), arm_convolve_fast_s16(), arm_convolve_s16(), arm_convolve_s8(), arm_depthwise_conv_3x3_s8(), arm_depthwise_conv_s8_opt(), arm_elementwise_add_s16(), arm_elementwise_add_s8(), arm_elementwise_mul_s16(), arm_elementwise_mul_s8(), arm_max_pool_s16(), arm_max_pool_s8(), arm_nn_mat_mult_kernel_s16(), arm_nn_mat_mult_kernel_s8_s16(), arm_nn_mat_mult_nt_t_s8(), arm_nn_mat_mult_s8(), arm_nn_softmax_common_s8(), arm_nn_vec_mat_mult_t_s16(), arm_nn_vec_mat_mult_t_s8(), arm_nn_vec_mat_mult_t_svdf_s8(), arm_relu6_s8(), arm_softmax_s16(), arm_softmax_u8(), clamp_output(), depthwise_conv_s16_generic_s16(), depthwise_conv_s8_generic(), depthwise_conv_s8_mult_4(), depthwise_conv_u8_generic(), and depthwise_conv_u8_mult_4().
#define MIN | ( | A, | |
B | |||
) |
Referenced by __attribute__(), arm_avgpool_s16(), arm_avgpool_s8(), arm_convolve_1_x_n_s8(), arm_convolve_1x1_s8_fast(), arm_convolve_fast_s16(), arm_convolve_s16(), arm_convolve_s8(), arm_depthwise_conv_3x3_s8(), arm_depthwise_conv_s8_opt(), arm_elementwise_add_s16(), arm_elementwise_add_s8(), arm_elementwise_mul_s16(), arm_elementwise_mul_s8(), arm_max_pool_s16(), arm_max_pool_s8(), arm_nn_mat_mult_kernel_s16(), arm_nn_mat_mult_kernel_s8_s16(), arm_nn_mat_mult_nt_t_s8(), arm_nn_mat_mult_s8(), arm_nn_vec_mat_mult_t_s16(), arm_nn_vec_mat_mult_t_s8(), arm_nn_vec_mat_mult_t_svdf_s8(), arm_relu6_s8(), arm_softmax_s16(), clamp_output(), depthwise_conv_s16_generic_s16(), depthwise_conv_s8_generic(), depthwise_conv_s8_mult_4(), depthwise_conv_u8_generic(), and depthwise_conv_u8_mult_4().
#define MUL_POW2 | ( | a, | |
b | |||
) |
Referenced by arm_nn_one_over_one_plus_x_for_x_in_0_1().
#define MUL_SAT | ( | a, | |
b | |||
) |
#define MUL_SAT_MVE | ( | a, | |
b | |||
) |
Referenced by arm_softmax_s8().
#define NN_ROUND | ( | out_shift | ) |
Referenced by arm_convolve_1x1_HWC_q7_fast_nonsquare(), arm_convolve_HWC_q15_basic(), arm_convolve_HWC_q15_fast(), arm_convolve_HWC_q15_fast_nonsquare(), arm_convolve_HWC_q7_basic(), arm_convolve_HWC_q7_basic_nonsquare(), arm_convolve_HWC_q7_fast(), arm_convolve_HWC_q7_fast_nonsquare(), arm_convolve_HWC_q7_RGB(), arm_depthwise_separable_conv_HWC_q7(), arm_depthwise_separable_conv_HWC_q7_nonsquare(), arm_fully_connected_mat_q7_vec_q15(), arm_fully_connected_mat_q7_vec_q15_opt(), arm_fully_connected_q15(), arm_fully_connected_q15_opt(), arm_fully_connected_q7(), arm_fully_connected_q7_opt(), arm_nn_mat_mult_kernel_q7_q15(), arm_nn_mat_mult_kernel_q7_q15_reordered(), arm_nn_mult_q15(), and arm_nn_mult_q7().
#define ONE_OVER1 | ( | x | ) |
Referenced by arm_nn_softmax_common_s8(), arm_softmax_s8(), and arm_softmax_u8().
#define PACK_Q7x4_32x1 | ( | v0, | |
v1, | |||
v2, | |||
v3 | |||
) |
Referenced by arm_elementwise_add_s8(), and arm_elementwise_mul_s8().
#define REDUCE_MULTIPLIER | ( | _mult | ) |
#define RIGHT_SHIFT | ( | _shift | ) |
Referenced by arm_nn_requantize().
#define SELECT_IF_NON_ZERO | ( | x | ) |
Referenced by arm_nn_exp_on_negative_values().
#define SELECT_USING_MASK | ( | mask, | |
a, | |||
b | |||
) |
Referenced by arm_nn_exp_on_negative_values(), and arm_nn_mult_by_power_of_two().
__STATIC_FORCEINLINE void arm_memcpy_q7 | ( | q7_t *__RESTRICT | dst, |
const q7_t *__RESTRICT | src, | ||
uint32_t | block_size | ||
) |
[in,out] | dst | Destination pointer |
[in] | src | Source pointer. |
[in] | block_size | Number of bytes to copy. |
Referenced by arm_concatenation_s8_w(), arm_concatenation_s8_x(), arm_concatenation_s8_y(), arm_concatenation_s8_z(), arm_convolve_fast_s16(), arm_convolve_HWC_q7_RGB(), arm_convolve_s8(), arm_depthwise_conv_s8_opt(), arm_max_pool_s8(), and arm_reshape_s8().
__STATIC_FORCEINLINE void arm_memset_q7 | ( | q7_t * | dst, |
const q7_t | val, | ||
uint32_t | block_size | ||
) |
[in,out] | dst | Destination pointer |
[in] | val | Value to set |
[in] | block_size | Number of bytes to set. |
Referenced by arm_convolve_fast_s16(), arm_convolve_HWC_q7_RGB(), and arm_depthwise_conv_s8_opt().
q7_t* arm_nn_depthwise_conv_s8_core | ( | const q7_t * | row, |
const q15_t * | col, | ||
const uint16_t | num_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | out_offset, | ||
const int32_t | activation_min, | ||
const int32_t | activation_max, | ||
const uint16_t | kernel_size, | ||
const int32_t *const | output_bias, | ||
q7_t * | out | ||
) |
[in] | row | pointer to row |
[in] | col | pointer to im2col buffer, always consists of 2 columns. |
[in] | num_ch | number of channels |
[in] | out_shift | pointer to per output channel requantization shift parameter. |
[in] | out_mult | pointer to per output channel requantization multiplier parameter. |
[in] | out_offset | output tensor offset. |
[in] | activation_min | minimum value to clamp the output to. Range : int8 |
[in] | activation_max | maximum value to clamp the output to. Range : int8 |
[in] | kernel_size | number of elements in one column. |
[in] | output_bias | per output channel bias. Range : int32 |
[out] | out | pointer to output |
Supported framework: TensorFlow Lite micro.
__STATIC_FORCEINLINE q31_t arm_nn_divide_by_power_of_two | ( | const q31_t | dividend, |
const q31_t | exponent | ||
) |
[in] | dividend | - Dividend |
[in] | exponent | - Divisor = power(2, exponent) Range: [0, 31] |
Referenced by arm_nn_requantize().
__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult | ( | const q31_t | m1, |
const q31_t | m2 | ||
) |
[in] | m1 | Multiplicand. Range: {NN_Q31_MIN, NN_Q31_MAX} |
[in] | m2 | Multiplier. Range: {NN_Q31_MIN, NN_Q31_MAX} |
References NN_Q31_MAX, and NN_Q31_MIN.
__STATIC_FORCEINLINE q31_t arm_nn_doubling_high_mult_no_sat | ( | const q31_t | m1, |
const q31_t | m2 | ||
) |
[in] | m1 | Multiplicand. Range: {NN_Q31_MIN, NN_Q31_MAX} |
[in] | m2 | Multiplier. Range: {NN_Q31_MIN, NN_Q31_MAX} |
References arm_nn_double::high, arm_nn_long_long::long_long, arm_nn_double::low, and arm_nn_long_long::word.
Referenced by arm_nn_requantize().
__STATIC_FORCEINLINE int32_t arm_nn_exp_on_negative_values | ( | int32_t | val | ) |
References DIV_POW2, MASK_IF_ZERO, MUL_SAT, NN_Q31_MAX, SELECT_IF_NON_ZERO, and SELECT_USING_MASK.
q7_t* arm_nn_mat_mult_kernel_q7_q15_reordered | ( | const q7_t * | pA, |
const q15_t * | pInBuffer, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | numCol_A, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
const q7_t * | bias, | ||
q7_t * | pOut | ||
) |
[in] | pA | pointer to operand A |
[in] | pInBuffer | pointer to operand B, always consists of 2 vectors |
[in] | ch_im_out | numRow of A |
[in] | numCol_A | numCol of A |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in] | bias | the bias |
[in,out] | pOut | pointer to output |
This function assumes that the data in pInBuffer is reordered.
Matrix-multiplication function for convolution with reordered columns.
Refer to header file for details.
References arm_nn_read_q15x2_ia(), and NN_ROUND.
Referenced by arm_convolve_1x1_HWC_q7_fast_nonsquare(), arm_convolve_HWC_q7_fast(), and arm_convolve_HWC_q7_fast_nonsquare().
q15_t* arm_nn_mat_mult_kernel_s16 | ( | const q7_t * | input_a, |
const q15_t * | input_b, | ||
const int32_t | output_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int16_t | activation_min, | ||
const int16_t | activation_max, | ||
const int32_t | num_col_a, | ||
const int64_t *const | output_bias, | ||
q15_t * | out_0 | ||
) |
[in] | input_a | pointer to operand A |
[in] | input_b | pointer to operand B, always consists of 2 vectors. |
[in] | output_ch | number of rows of A |
[in] | out_shift | pointer to per output channel requantization shift parameter. |
[in] | out_mult | pointer to per output channel requantization multiplier parameter. |
[in] | activation_min | minimum value to clamp the output to. Range : int16 |
[in] | activation_max | maximum value to clamp the output to. Range : int16 |
[in] | num_col_a | number of columns of A |
[in] | output_bias | per output channel bias. Range : int64 |
[in,out] | out_0 | pointer to output |
This function does the matrix multiplication of weight matrix for all output channels with 2 columns from im2col and produces two elements/output_channel. The outputs are clamped in the range provided by activation min and max. Supported framework: TensorFlow Lite micro.
References arm_nn_read_q15x2_ia(), arm_nn_requantize(), arm_nn_requantize_s64(), MAX, MIN, and REDUCE_MULTIPLIER.
Referenced by arm_convolve_fast_s16().
q7_t* arm_nn_mat_mult_kernel_s8_s16 | ( | const q7_t * | input_a, |
const q15_t * | input_b, | ||
const uint16_t | output_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | out_offset, | ||
const int16_t | activation_min, | ||
const int16_t | activation_max, | ||
const uint16_t | num_col_a, | ||
const int32_t *const | output_bias, | ||
q7_t * | out_0 | ||
) |
[in] | input_a | pointer to operand A |
[in] | input_b | pointer to operand B, always consists of 2 vectors. |
[in] | output_ch | number of rows of A |
[in] | out_shift | pointer to per output channel requantization shift parameter. |
[in] | out_mult | pointer to per output channel requantization multiplier parameter. |
[in] | out_offset | output tensor offset. |
[in] | activation_min | minimum value to clamp the output to. Range : int8 |
[in] | activation_max | maximum value to clamp the output to. Range : int8 |
[in] | num_col_a | number of columns of A |
[in] | output_bias | per output channel bias. Range : int32 |
[in,out] | out_0 | pointer to output |
This function does the matrix multiplication of weight matrix for all output channels with 2 columns from im2col and produces two elements/output_channel. The outputs are clamped in the range provided by activation min and max. Supported framework: TensorFlow Lite micro.
References arm_nn_read_q15x2_ia(), arm_nn_requantize(), MAX, and MIN.
Referenced by arm_convolve_s8().
q7_t* arm_nn_mat_mult_s8 | ( | const q7_t * | input_row, |
const q7_t * | input_col, | ||
const uint16_t | output_ch, | ||
const uint16_t | col_batches, | ||
const int32_t * | output_shift, | ||
const int32_t * | output_mult, | ||
const int32_t | out_offset, | ||
const int32_t | col_offset, | ||
const int32_t | row_offset, | ||
const int16_t | out_activation_min, | ||
const int16_t | out_activation_max, | ||
const uint16_t | row_len, | ||
const int32_t *const | bias, | ||
q7_t * | out | ||
) |
[in] | input_row | pointer to row operand |
[in] | input_col | pointer to col operand |
[in] | output_ch | number of rows of input_row |
[in] | col_batches | number of column batches. Range: 1 to 4 |
[in] | output_shift | pointer to per output channel requantization shift parameter. |
[in] | output_mult | pointer to per output channel requantization multiplier parameter. |
[in] | out_offset | output tensor offset. |
[in] | col_offset | input tensor(col) offset. |
[in] | row_offset | kernel offset(row). Not used. |
[in] | out_activation_min | minimum value to clamp the output to. Range : int8 |
[in] | out_activation_max | maximum value to clamp the output to. Range : int8 |
[in] | row_len | number of elements in each row |
[in] | bias | per output channel bias. Range : int32 |
[in,out] | out | pointer to output |
Supported framework: TensorFlow Lite
References arm_nn_requantize(), MAX, and MIN.
Referenced by arm_convolve_s8().
__STATIC_FORCEINLINE q31_t arm_nn_mult_by_power_of_two | ( | const int32_t | val, |
const int32_t | exp | ||
) |
References MASK_IF_NON_ZERO, NN_Q31_MAX, NN_Q31_MIN, and SELECT_USING_MASK.
__STATIC_FORCEINLINE int32_t arm_nn_one_over_one_plus_x_for_x_in_0_1 | ( | int32_t | val | ) |
References MUL_POW2, MUL_SAT, and NN_Q31_MAX.
__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2 | ( | const q15_t * | in_q15 | ) |
[in] | in_q15 | pointer to address of input. |
Referenced by arm_depthwise_conv_s8_opt(), arm_nn_accumulate_q7_to_q15(), clamp_output(), and compare_and_replace_if_larger().
__STATIC_FORCEINLINE q31_t arm_nn_read_q15x2_ia | ( | const q15_t ** | in_q15 | ) |
[in] | in_q15 | Pointer to pointer that holds address of input. |
Referenced by arm_convolve_1x1_HWC_q7_fast_nonsquare(), arm_convolve_fast_s16(), arm_convolve_HWC_q15_basic(), arm_convolve_HWC_q15_fast(), arm_convolve_HWC_q15_fast_nonsquare(), arm_convolve_HWC_q7_basic(), arm_convolve_HWC_q7_basic_nonsquare(), arm_convolve_HWC_q7_fast(), arm_convolve_HWC_q7_fast_nonsquare(), arm_convolve_HWC_q7_RGB(), arm_convolve_s8(), arm_fully_connected_mat_q7_vec_q15(), arm_fully_connected_mat_q7_vec_q15_opt(), arm_fully_connected_q15(), arm_fully_connected_q15_opt(), arm_fully_connected_q7(), arm_fully_connected_q7_opt(), arm_nn_mat_mult_kernel_q7_q15(), arm_nn_mat_mult_kernel_q7_q15_reordered(), arm_nn_mat_mult_kernel_s16(), arm_nn_mat_mult_kernel_s8_s16(), arm_nn_vec_mat_mult_t_s16(), arm_relu_q15(), arm_svdf_state_s16_s8(), and compare_and_replace_if_larger().
__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4 | ( | const q7_t * | in_q7 | ) |
[in] | in_q7 | pointer to address of input. |
Referenced by arm_convolve_HWC_q7_RGB(), arm_depthwise_conv_3x3_s8(), arm_depthwise_conv_s8_opt(), arm_depthwise_separable_conv_HWC_q7(), arm_depthwise_separable_conv_HWC_q7_nonsquare(), arm_nn_mat_mult_nt_t_s8(), clamp_output(), and compare_and_replace_if_larger_q7().
__STATIC_FORCEINLINE q31_t arm_nn_read_q7x4_ia | ( | const q7_t ** | in_q7 | ) |
[in] | in_q7 | Pointer to pointer that holds address of input. |
Referenced by arm_fully_connected_mat_q7_vec_q15_opt(), arm_fully_connected_q7_opt(), arm_nn_accumulate_q7_to_q15(), arm_nn_add_q7(), arm_nn_mat_mult_nt_t_s8(), arm_nn_vec_mat_mult_t_s8(), arm_nn_vec_mat_mult_t_svdf_s8(), arm_q7_to_q15_no_shift(), arm_q7_to_q15_reordered_no_shift(), arm_q7_to_q15_reordered_with_offset(), arm_q7_to_q15_with_offset(), arm_relu_q7(), and compare_and_replace_if_larger_q7().
__STATIC_FORCEINLINE q31_t arm_nn_requantize | ( | const q31_t | val, |
const q31_t | multiplier, | ||
const q31_t | shift | ||
) |
[in] | val | Value to be requantized |
[in] | multiplier | Multiplier. Range: {NN_Q31_MIN + 1, NN_Q31_MAX} |
[in] | shift | left or right shift for 'val * multiplier' |
References arm_nn_divide_by_power_of_two(), arm_nn_doubling_high_mult_no_sat(), LEFT_SHIFT, and RIGHT_SHIFT.
Referenced by arm_convolve_1x1_s8_fast(), arm_convolve_fast_s16(), arm_convolve_s8(), arm_depthwise_conv_3x3_s8(), arm_depthwise_conv_s8_opt(), arm_elementwise_add_s16(), arm_elementwise_add_s8(), arm_elementwise_mul_s16(), arm_elementwise_mul_s8(), arm_nn_mat_mult_kernel_s16(), arm_nn_mat_mult_kernel_s8_s16(), arm_nn_mat_mult_nt_t_s8(), arm_nn_mat_mult_s8(), arm_nn_vec_mat_mult_t_s8(), arm_nn_vec_mat_mult_t_svdf_s8(), arm_softmax_s16(), arm_svdf_s8(), arm_svdf_state_s16_s8(), depthwise_conv_s8_generic(), depthwise_conv_s8_mult_4(), depthwise_conv_u8_generic(), and depthwise_conv_u8_mult_4().
__STATIC_FORCEINLINE q31_t arm_nn_requantize_s64 | ( | const q63_t | val, |
const q31_t | reduced_multiplier, | ||
const q31_t | shift | ||
) |
[in] | val | Value to be requantized in the range {-(1<<47)} to {(1<<47) - 1} |
[in] | reduced_multiplier | Reduced multiplier, i.e. a multiplier reduced from the range {NN_Q31_MIN + 1, NN_Q31_MAX} to the range {Q16_MIN + 1, Q16_MAX} |
[in] | shift | Left or right shift for 'val * multiplier' in the range {-31} to {7} |
Referenced by __attribute__(), arm_convolve_fast_s16(), arm_convolve_s16(), arm_nn_mat_mult_kernel_s16(), arm_nn_vec_mat_mult_t_s16(), and depthwise_conv_s16_generic_s16().
__STATIC_FORCEINLINE void arm_nn_write_q15x2_ia | ( | q15_t ** | dest_q15, |
q31_t | src_q31 | ||
) |
[in] | dest_q15 | Pointer to pointer that holds address of destination. |
[in] | src_q31 | Input value to be written. |
Referenced by arm_nn_accumulate_q7_to_q15(), arm_q7_to_q15_no_shift(), arm_q7_to_q15_reordered_with_offset(), arm_q7_to_q15_with_offset(), arm_relu_q15(), clamp_output(), and compare_and_replace_if_larger().
__STATIC_FORCEINLINE void arm_nn_write_q7x4_ia | ( | q7_t ** | in, |
q31_t | value | ||
) |
[in] | in | Pointer to pointer that holds address of destination; the pointer is incremented after the write |
[in] | value | Four bytes to copy |
Referenced by arm_elementwise_add_s8(), arm_elementwise_mul_s8(), arm_q7_to_q15_reordered_no_shift(), arm_relu_q7(), clamp_output(), and compare_and_replace_if_larger_q7().