34 namespace quantization
54 int32_t *quant_multiplier,
58 const float internal_epsilon = ignore_epsilon ? 0.0f :
epsilon;
64 if(std::fabs(0.0f - multiplier) < internal_epsilon)
66 *quant_multiplier = 0;
72 const double q = std::frexp(multiplier, &shift_exp);
73 *right_shift = -1 * shift_exp;
76 if(q_fixed == fixed_point_one_Q0)
82 if(ignore_epsilon && *right_shift > 31)
90 *quant_multiplier =
static_cast<int32_t
>(q_fixed);
96 int32_t *quantized_multiplier,
104 const double q = std::frexp(multiplier, &shift_exp);
105 *left_shift = shift_exp;
108 if(q_fixed == fixed_point_one_Q0)
115 *quantized_multiplier =
static_cast<int32_t
>(q_fixed);
129 const unsigned int size = wq_info.
scale().size();
133 quant_multipliers.resize(size);
134 quant_shifts.resize(size);
136 const auto &w_scales = wq_info.
scale();
137 const float i_scale = iq_info.
scale().at(0);
138 const float o_scale = oq_info.
scale().at(0);
140 for(
unsigned int i = 0; i < size; ++i)
142 const float multiplier = i_scale * w_scales[i] / o_scale;
143 int32_t quant_multiplier = 0;
144 int32_t quant_shift = 0;
146 quant_multipliers[i] = quant_multiplier;
147 quant_shifts[i] = quant_shift;
159 int min_quant_val = 0;
160 int max_quant_val = 0;
164 min_quant_val = std::numeric_limits<uint8_t>::min();
165 max_quant_val = std::numeric_limits<uint8_t>::max();
169 min_quant_val = std::numeric_limits<int8_t>::min();
170 max_quant_val = std::numeric_limits<int8_t>::max();
173 min_quant_val = std::numeric_limits<uint16_t>::min();
174 max_quant_val = std::numeric_limits<uint16_t>::max();
177 min_quant_val = std::numeric_limits<int16_t>::min();
178 max_quant_val = std::numeric_limits<int16_t>::max();
183 return std::make_pair(min_quant_val, max_quant_val);
188 int32_t *output_multipliers_ptr,
189 int32_t *output_shifts_ptr)
195 const unsigned int num_filters = wq_info.
scale().size();
197 for(
unsigned int i = 0; i < num_filters; ++i)
199 int32_t output_multiplier = 0;
200 int32_t output_shift = 0;
201 const float multiplier = iq_info.
scale * wq_info.
scale()[i] / oq_info.
scale;
204 output_multipliers_ptr[i] = output_multiplier;
205 output_shifts_ptr[i] = output_shift;
211 bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
214 int64_t ab_64 = a_64 * b_64;
215 bool is_positive_or_zero = a == 0 || b == 0 || (std::signbit(a) == std::signbit(b));
216 int32_t nudge = is_positive_or_zero ? (1 << 30) : (1 - (1 << 30));
217 int32_t ab_x2_high32 =
static_cast<int32_t
>((ab_64 + nudge) / (1ll << 31));
218 return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
223 const int32_t mask = (1 << exponent) - 1;
224 const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
225 return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
230 const auto left_shift = shift > 0 ? shift : 0;
231 const auto right_shift = shift > 0 ? 0 : -shift;
241 else if(exponent < 0)
247 constexpr
auto min = std::numeric_limits<int32_t>::min();
248 constexpr
auto max = std::numeric_limits<int32_t>::max();
249 const auto width =
sizeof(int32_t) * 8;
251 const int32_t threshold = ((1 << (width - 1 - exponent)) - 1);
252 bool pos_mask = v > threshold;
253 bool neg_mask = v < -threshold;
254 int32_t result = v << exponent;
255 result = pos_mask ? max : result;
256 result = neg_mask ? min : result;
268 output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
275 while(input >= (1 << 29))
281 const uint32_t max_left_shift_bits = __builtin_clz(static_cast<uint32_t>(input)) - 1;
282 const uint32_t max_left_shift_bits_pairs = max_left_shift_bits / 2;
283 const uint32_t left_shift_bit_pairs = max_left_shift_bits_pairs - 1;
284 output_shift -= left_shift_bit_pairs;
285 input <<= 2 * left_shift_bit_pairs;
288 using FixedPointRawType = int32_t;
289 constexpr uint32_t fixedpoint_position = 3;
290 constexpr uint32_t fixedpoint_int_position =
sizeof(FixedPointRawType) * 8 - 1 - fixedpoint_position;
291 using FixedPoint3 = FixedPointRawType;
292 using FixedPoint0 = FixedPointRawType;
295 const FixedPoint3 fixedpoint_input = (input >> 1);
297 const FixedPoint3 fixedpoint_half_three = (0x1 << fixedpoint_int_position) + (0x1 << (fixedpoint_int_position - 1));
300 FixedPoint3 x = 0x1 << fixedpoint_int_position;
303 auto fixed_point_mul = [](FixedPointRawType a, FixedPointRawType
b) -> FixedPointRawType
309 auto fixed_point_rescale = [](FixedPointRawType a, uint32_t src_bit, uint32_t dst_bit) -> FixedPointRawType
311 const uint32_t exponent = src_bit - dst_bit;
316 constexpr int32_t num_iteration = 5;
317 for(int32_t i = 0; i < num_iteration; ++i)
319 const auto x3 = fixed_point_rescale(fixed_point_mul(fixed_point_mul(x, x), x), 9, fixedpoint_position);
320 x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), 6, fixedpoint_position);
324 const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
325 x = fixed_point_mul(fixedpoint_half_sqrt_2, x);
329 output_inv_sqrt <<= -output_shift;
333 output_shift *= reverse_shift;
int32_t gemmlowp_multiplier
GEMMLowp output stage multiplier used for quantizing to QASYMM8.
quantized, symmetric fixed-point 16-bit number
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Store the tensor's metadata.
quantized, asymmetric fixed-point 16-bit number
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift)
Calculate quantized representation of multiplier having value greater than one.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Copyright (c) 2017-2021 Arm Limited.
std::vector< int32_t > gemmlowp_shifts
GEMMLowp output stage shifts used for quantizing to QASYMM8.
Quantization information.
std::pair< int, int > get_min_max_values_from_quantized_data_type(DataType data_type)
Get minimum and maximum values for the input quantized data type.
void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift)
Compute quantized multiplier and shift for the inverse square root of input.
int32_t saturating_rounding_doubling_highmul(int32_t a, int32_t b)
Compute the saturating, rounding, doubling high multiplication of two integers.
Status calculate_quantized_multipliers(const QuantizationInfo &iq_info, const QuantizationInfo &wq_info, const QuantizationInfo &oq_info, GEMMLowpOutputStageInfo &stage_info)
Calculate quantized representation of per-channel multipliers.
int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int32_t shift)
Compute the value multiplied by given quantized multiplier and shift.
quantized, asymmetric fixed-point 8-bit number unsigned
std::vector< int32_t > gemmlowp_multipliers
GEMMLowp output stage multipliers used for quantizing to QASYMM8.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
GEMMLowp output stage info.
const std::vector< float > & scale() const
Scale vector accessor.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
quantized, symmetric fixed-point 8-bit number
int32_t rounding_divide_by_pow2(int32_t x, int exponent)
Round to the nearest division by a power-of-two using exponent, copied from NEMath.
int32_t gemmlowp_shift
GEMMLowp output stage shift used for quantizing to uint8.
void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr)
Compute quantized per-channel multipliers and shifts.
constexpr int64_t fixed_point_one_Q0
T round(T value)
Round floating-point value with half value rounding away from zero.
int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v)
Compute the value multiplied by the power-of-two.
quantized, asymmetric fixed-point 8-bit number signed
Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *quant_multiplier, int32_t *right_shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier with value less than one.
DataType
Available data types.