// Helper struct parameterised on a type T (its template header and the member
// bodies fall on lines this listing omits).
// NOTE(review): the visible tests compare T against int8_t, uint8_t and int16_t in
// turn; what each branch selects is not shown here - presumably a DataType value,
// TODO confirm against the full file.
44 struct DataTypeExtractor
49 if(std::is_same<T, int8_t>::value)
53 else if(std::is_same<T, uint8_t>::value)
57 else if(std::is_same<T, int16_t>::value)
// Requantizes every element of `in` with an integer multiplier followed by a right shift.
// NOTE(review): this listing omits some original lines (the embedded numbers skip),
// so guards around the bias/clamp steps and the final store are not visible here.
//
// in               Input tensor; visited linearly over num_elements().
// bias             Bias tensor pointer, indexed by (i % cols_in). Callers in this file also
//                  pass nullptr, so presumably a null check sits on an omitted line - TODO confirm.
// dst              Output tensor pointer (the write itself is on a line not shown).
// result_offset    Added to each input element before the bias.
// result_mult_int  Integer multiplier(s); more than one entry selects per-channel mode.
// result_shift     Right-shift amount(s), selected the same way as the multiplier.
// min, max         Bounds used to clamp the shifted result.
65 template <
typename TIn,
typename TOut>
66 void quantize_down_scale(
const SimpleTensor<TIn> *in,
const SimpleTensor<TIn> *
bias, SimpleTensor<TOut> *
dst, int32_t result_offset, std::vector<int32_t> result_mult_int,
67 std::vector<int32_t> result_shift, int32_t min, int32_t max)
// Extent of the x dimension; used as the period for bias/multiplier/shift lookups.
69 const int cols_in = in->shape().x();
// Per-channel mode is inferred purely from the multiplier vector holding more than one entry.
70 const bool is_per_channel = result_mult_int.size() > 1;
// One iteration per element; the pragma requests OpenMP parallelisation of this loop.
73 #pragma omp parallel for 75 for(
int i = 0; i < in->num_elements(); ++i)
// Accumulate in int32_t regardless of TIn.
77 int32_t result = ((*in)[i] + result_offset);
// Bias entry is chosen by position along x.
81 result += (*bias)[i % cols_in];
// Scale: entry (i % cols_in) in per-channel mode, otherwise entry 0.
84 result *= (is_per_channel) ? result_mult_int[i % cols_in] : result_mult_int[0];
// Right shift with the matching per-channel or single shift amount.
86 result >>= (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];
// Clamp to [min, max] (any enclosing condition is on a line not shown here).
91 result = std::max(min, std::min(max, result));
// Upper saturation at the largest value representable in TOut; the start of this
// statement is on an omitted line.
95 std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
// Requantizes every element of `in` using a fixed-point multiplier and shift, then adds an
// offset after the shift.
// NOTE(review): the lines that apply `multiplier` and `shift` to `result` (original lines
// 121-129) are omitted from this listing, as are any guards and the final store.
//
// in                            Input tensor; visited linearly over num_elements().
// bias                          Bias tensor pointer, indexed by (i % cols_in). Callers in this file also
//                               pass nullptr, so presumably a null check sits on an omitted line - TODO confirm.
// dst                           Output tensor pointer (the write itself is on a line not shown).
// result_fixedpoint_multiplier  Multiplier(s); more than one entry selects per-channel mode.
// result_shift                  Shift amount(s), selected the same way as the multiplier.
// result_offset_after_shift     Added to the result after the (omitted) multiply/shift step.
// min, max                      Bounds used to clamp the result.
99 template <
typename TIn,
typename TOut>
100 void quantize_down_scale_by_fixedpoint(
const SimpleTensor<TIn> *in,
const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, std::vector<int32_t> result_fixedpoint_multiplier,
101 std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
// Extent of the x dimension; used as the period for bias/multiplier/shift lookups.
103 const int cols_in = in->shape().x();
// Per-channel mode is inferred from the multiplier vector holding more than one entry.
104 const bool is_per_channel = result_fixedpoint_multiplier.size() > 1;
// One iteration per element; the pragma requests OpenMP parallelisation of this loop.
107 #pragma omp parallel for 109 for(
int i = 0; i < in->num_elements(); ++i)
// Working value is kept in the input element type.
111 TIn result = (*in)[i];
// Bias entry is chosen by position along x.
115 result += (*bias)[i % cols_in];
// Select entry (i % cols_in) in per-channel mode, otherwise entry 0.
119 const int32_t multiplier = (is_per_channel) ? result_fixedpoint_multiplier[i % cols_in] : result_fixedpoint_multiplier[0];
120 const int32_t shift = (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];
// Offset is applied after the multiply/shift step that this listing does not show.
130 result += result_offset_after_shift;
// Clamp to [min, max] (any enclosing condition is on a line not shown here).
135 result = std::max(min, std::min(max, result));
// Upper saturation at the largest value representable in TOut; the start of this
// statement is on an omitted line.
139 std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
// Requantizes every element of `in` using a floating-point multiplier and an offset.
// NOTE(review): the lines that convert `result_f` back into `result` (original lines
// 166-170) are omitted from this listing, as are any guards and the final store.
//
// in                      Input tensor; visited linearly over num_elements().
// bias                    Bias tensor pointer, indexed by (i % cols_in). Callers in this file also
//                         pass nullptr, so presumably a null check sits on an omitted line - TODO confirm.
// dst                     Output tensor pointer (the write itself is on a line not shown).
// result_real_multiplier  Multiplier(s); more than one entry selects per-channel mode.
// result_offset           Added, as a float, after the multiplication.
// min, max                Bounds used to clamp the result.
143 template <
typename TIn,
typename TOut>
144 void quantize_down_scale_by_float(
const SimpleTensor<TIn> *in,
const SimpleTensor<TIn> *bias, SimpleTensor<TOut> *dst, std::vector<float_t> result_real_multiplier,
145 int32_t result_offset, int32_t min, int32_t max)
// Extent of the x dimension; used as the period for bias/multiplier lookups.
147 const int cols_in = in->shape().x();
// Per-channel mode is inferred from the multiplier vector holding more than one entry.
148 const bool is_per_channel = result_real_multiplier.size() > 1;
// One iteration per element; the pragma requests OpenMP parallelisation of this loop.
151 #pragma omp parallel for 153 for(
int i = 0; i < in->num_elements(); ++i)
// Working value is kept in the input element type.
155 TIn result = (*in)[i];
// Bias entry is chosen by position along x.
159 result += (*bias)[i % cols_in];
// Select entry (i % cols_in) in per-channel mode, otherwise entry 0.
163 const float_t multiplier = (is_per_channel) ? result_real_multiplier[i % cols_in] : result_real_multiplier[0];
// Scale and offset in floating point; how result_f is rounded back is not shown here.
165 float_t result_f =
static_cast<float_t
>(result) * multiplier + static_cast<float_t>(result_offset);
// Clamp to [min, max] (any enclosing condition is on a line not shown here).
171 result = std::max(min, std::min(max, result));
// Upper saturation at the largest value representable in TOut; the start of this
// statement is on an omitted line.
175 std::min<TIn>(std::numeric_limits<TOut>::max(), result)));
// Batched integer matrix multiply: for each depth slice, c = (a + a_offset) * (b + b_offset),
// accumulated in T_out.
// NOTE(review): the signature line, the declaration of `c`, and the sizing/reset of `acc`
// are on lines this listing omits; the function name is presumably
// gemmlowp_matrix_multiply_core - TODO confirm against the full file.
//
// Indexing visible below: a is read as [depth][i][k] with row length K, b as [depth][k][j]
// with row length N, and c is written as [depth][i][j] with row length N.
180 template <
typename T_out,
typename T_in,
typename T_in_1>
// Compile-time restriction: the accumulator/output element type must be int32_t.
183 static_assert(std::is_same<
typename std::decay<T_out>::type, int32_t>::value,
"Only int32_t is allowed for the output");
// K: x extent of a (shared inner dimension in the indexing below).
188 const int K = a.
shape().x();
// M: y extent of a (rows of the result).
189 const int M = a.
shape().y();
// N: x extent of b (columns of the result).
190 const int N = b.
shape().x();
// D: z extent of a (number of depth slices processed).
191 const int D = a.
shape().z();
// Element count of one depth slice of a.
193 const int a_stride_z = K *
M;
// A zero stride makes every depth slice reuse the same b when b has at most two dimensions.
195 const int b_stride_z = b.
shape().num_dimensions() > 2 ? N *
K : 0;
// Element count of one depth slice of c.
196 const int c_stride_z = N *
M;
// Per-row accumulator, indexed by output column j.
198 std::vector<T_out> acc;
// Outer loop over depth slices.
201 for(
int depth = 0; depth < D; ++depth)
// Linear start offsets of the current slice in a, b and c.
203 const int base_addr_a = depth * a_stride_z;
204 const int base_addr_b = depth * b_stride_z;
205 const int base_addr_c = depth * c_stride_z;
// Loop over result rows.
207 for(
int i = 0; i <
M; ++i)
// Loop over columns; its body is not shown (presumably clears acc[j] - TODO confirm).
209 for(
int j = 0; j <
N; ++j)
// Loop over the inner dimension.
213 for(
int k = 0;
k <
K; ++
k)
// Offset-corrected element a[i][k], widened to T_out; hoisted out of the column loop.
215 const T_out tmp_a = a_offset +
static_cast<T_out
>(a[base_addr_a +
k + i *
K]);
// Accumulate this k's contribution into every column of the row.
216 for(
int j = 0; j <
N; ++j)
// Offset-corrected element b[k][j], widened to T_out.
218 const T_out tmp_b = b_offset +
static_cast<T_out
>(b[base_addr_b + j +
k *
N]);
219 const T_out mult_as_int = tmp_a * tmp_b;
220 acc[j] += mult_as_int;
// Store the finished row of accumulators into c.
223 for(
int j = 0; j <
N; ++j)
225 c[base_addr_c + j + i *
N] = acc[j];
// Convenience entry point: forwards a, b and shape_c to gemmlowp_matrix_multiply_core
// with both offsets set to 0.
// NOTE(review): the signature line is omitted from this listing; the name `gemmlowp`
// is presumed from the surrounding file - TODO confirm.
234 template <
typename T1,
typename T2,
typename T3>
237 return gemmlowp_matrix_multiply_core<T1, T2, T3>(a,
b, shape_c, 0, 0);
// Bias-free wrapper: calls quantize_down_scale with a null bias pointer and the address
// of a local `dst`.
// NOTE(review): the start of the signature and the declaration/return of `dst` are on
// lines this listing omits; the name is presumably gemmlowp_quantize_down_scale - TODO confirm.
240 template <
typename TIn,
typename TOut>
242 int32_t min, int32_t max)
// nullptr in the bias slot means no bias tensor is supplied.
246 quantize_down_scale<TIn, TOut>(&in,
nullptr, &
dst, result_offset, result_mult_int, result_shift, min, max);
// Bias-taking wrapper: calls quantize_down_scale with the addresses of `in`, `bias`
// and a local `dst`.
// NOTE(review): the start of the signature and the declaration/return of `dst` are on
// lines this listing omits; the name is presumably gemmlowp_quantize_down_scale - TODO confirm.
251 template <
typename TIn,
typename TOut>
253 std::vector<int32_t> result_shift, int32_t min, int32_t max)
// All quantization parameters are passed through unchanged.
257 quantize_down_scale<TIn, TOut>(&in, &
bias, &
dst, result_offset, result_mult_int, result_shift, min, max);
// Bias-free wrapper: calls quantize_down_scale_by_fixedpoint with a null bias pointer
// and the address of a local `dst`.
// NOTE(review): the start of the signature and the declaration/return of `dst` are on
// lines this listing omits; the name is presumably
// gemmlowp_quantize_down_scale_by_fixedpoint - TODO confirm.
262 template <
typename TIn,
typename TOut>
264 int32_t result_offset_after_shift, int32_t min, int32_t max)
// nullptr in the bias slot means no bias tensor is supplied.
268 quantize_down_scale_by_fixedpoint<TIn, TOut>(&in,
nullptr, &
dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
// Bias-taking wrapper: calls quantize_down_scale_by_fixedpoint with the addresses of
// `in`, `bias` and a local `dst`.
// NOTE(review): the start of the signature and the declaration/return of `dst` are on
// lines this listing omits; the name is presumably
// gemmlowp_quantize_down_scale_by_fixedpoint - TODO confirm.
273 template <
typename TIn,
typename TOut>
275 std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
// All quantization parameters are passed through unchanged.
279 quantize_down_scale_by_fixedpoint<TIn, TOut>(&in, &
bias, &
dst, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
// Bias-taking wrapper: calls quantize_down_scale_by_float with the addresses of `in`,
// `bias` and a local `dst`.
// NOTE(review): the start of the signature and the declaration/return of `dst` are on
// lines this listing omits; the name is presumably
// gemmlowp_quantize_down_scale_by_float - TODO confirm.
284 template <
typename TIn,
typename TOut>
286 std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
// All quantization parameters are passed through unchanged.
290 quantize_down_scale_by_float<TIn, TOut>(&in, &
bias, &
dst, result_real_multiplier, result_offset, min, max);
// Bias-free wrapper: calls quantize_down_scale_by_float with a null bias pointer and
// the address of a local `dst`.
// NOTE(review): the start of the signature and the declaration/return of `dst` are on
// lines this listing omits; the name is presumably
// gemmlowp_quantize_down_scale_by_float - TODO confirm.
295 template <
typename TIn,
typename TOut>
297 std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
// nullptr in the bias slot means no bias tensor is supplied.
301 quantize_down_scale_by_float<TIn, TOut>(&in,
nullptr, &
dst, result_real_multiplier, result_offset, min, max);
307 std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
309 std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
311 std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
313 std::vector<float_t> result_real_multiplier, int32_t result_offset, int32_t min, int32_t max);
315 std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
317 std::vector<int32_t> result_fixedpoint_multiplier,
318 std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
320 std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
322 std::vector<int32_t> result_fixedpoint_multiplier,
323 std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
325 std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
327 std::vector<int32_t> result_fixedpoint_multiplier,
328 std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
330 std::vector<int32_t> result_shift, int32_t min, int32_t max);
332 std::vector<int32_t> result_shift, int32_t min, int32_t max);
334 std::vector<int32_t> result_shift, int32_t min, int32_t max);
336 std::vector<int32_t> result_shift, int32_t min, int32_t max);
int32_t asymm_rounding_divide_by_pow2(int32_t x, int exponent)
Rounded to nearest division by a power-of-two.
SimpleTensor< T1 > gemmlowp(const SimpleTensor< T2 > &a, const SimpleTensor< T3 > &b, TensorShape shape_c)
quantized, symmetric fixed-point 16-bit number
template SimpleTensor< int32_t > gemmlowp< int32_t, int8_t, int8_t >(const SimpleTensor< int8_t > &a, const SimpleTensor< int8_t > &b, TensorShape shape_c)
template SimpleTensor< int32_t > gemmlowp< int32_t, uint8_t, int8_t >(const SimpleTensor< uint8_t > &a, const SimpleTensor< int8_t > &b, TensorShape shape_c)
TensorShape shape() const override
Shape of the tensor.
decltype(strategy::transforms) typedef type
Copyright (c) 2017-2023 Arm Limited.
1 channel, 1 S32 per channel
int32_t asymm_int_mult(int32_t a, int32_t b)
Multiplication of two integers.
1 channel, 1 U32 per channel
quantized, asymmetric fixed-point 8-bit number unsigned
template SimpleTensor< int32_t > gemmlowp< int32_t, uint8_t, uint8_t >(const SimpleTensor< uint8_t > &a, const SimpleTensor< uint8_t > &b, TensorShape shape_c)
int32_t quantize_down_scale_by_fixedpoint(int32_t val, int32_t result_mult_int, int32_t result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
Quantize down the input value in range [min, max].
Simple tensor object that stores elements in a consecutive chunk of memory.
T round(T value)
Round floating-point value with half value rounding away from zero.
SimpleTensor< TOut > gemmlowp_quantize_down_scale_by_fixedpoint(const SimpleTensor< TIn > &in, std::vector< int32_t > result_fixedpoint_multiplier, std::vector< int32_t > result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
quantized, asymmetric fixed-point 8-bit number signed
DataType
Available data types.
SimpleTensor< TOut > gemmlowp_quantize_down_scale_by_float(const SimpleTensor< TIn > &in, const SimpleTensor< TIn > &bias, std::vector< float_t > result_real_multiplier, int32_t result_offset, int32_t min, int32_t max)
SimpleTensor< T_out > gemmlowp_matrix_multiply_core(const SimpleTensor< T_in > &a, const SimpleTensor< T_in_1 > &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset)
SimpleTensor< TOut > gemmlowp_quantize_down_scale(const SimpleTensor< TIn > &in, int32_t result_offset, std::vector< int32_t > result_mult_int, std::vector< int32_t > result_shift, int32_t min, int32_t max)