34 namespace quantization
54 int32_t *quant_multiplier,
58 const float internal_epsilon = ignore_epsilon ? 0.0f :
epsilon;
66 const double q = std::frexp(multiplier, &shift_exp);
67 *right_shift = -1 * shift_exp;
70 if(q_fixed == fixed_point_one_Q0)
76 if(ignore_epsilon && *right_shift > 31)
84 *quant_multiplier =
static_cast<int32_t
>(q_fixed);
90 int32_t *quantized_multiplier,
98 const double q = std::frexp(multiplier, &shift_exp);
99 *left_shift = shift_exp;
102 if(q_fixed == fixed_point_one_Q0)
109 *quantized_multiplier =
static_cast<int32_t
>(q_fixed);
123 const unsigned int size = wq_info.
scale().size();
127 quant_multipliers.resize(size);
128 quant_shifts.resize(size);
130 const auto &w_scales = wq_info.
scale();
131 const float i_scale = iq_info.
scale().at(0);
132 const float o_scale = oq_info.
scale().at(0);
134 for(
unsigned int i = 0; i < size; ++i)
136 const float multiplier = i_scale * w_scales[i] / o_scale;
137 int32_t quant_multiplier = 0;
138 int32_t quant_shift = 0;
140 quant_multipliers[i] = quant_multiplier;
141 quant_shifts[i] = quant_shift;
153 int min_quant_val = 0;
154 int max_quant_val = 0;
158 min_quant_val = std::numeric_limits<uint8_t>::min();
159 max_quant_val = std::numeric_limits<uint8_t>::max();
163 min_quant_val = std::numeric_limits<int8_t>::min();
164 max_quant_val = std::numeric_limits<int8_t>::max();
167 min_quant_val = std::numeric_limits<uint16_t>::min();
168 max_quant_val = std::numeric_limits<uint16_t>::max();
171 min_quant_val = std::numeric_limits<int16_t>::min();
172 max_quant_val = std::numeric_limits<int16_t>::max();
177 return std::make_pair(min_quant_val, max_quant_val);
182 int32_t *output_multipliers_ptr,
183 int32_t *output_shifts_ptr)
189 const unsigned int num_filters = wq_info.
scale().size();
191 for(
unsigned int i = 0; i < num_filters; ++i)
193 int32_t output_multiplier = 0;
194 int32_t output_shift = 0;
195 const float multiplier = iq_info.
scale * wq_info.
scale()[i] / oq_info.
scale;
198 output_multipliers_ptr[i] = output_multiplier;
199 output_shifts_ptr[i] = output_shift;
205 bool overflow = a == b && a == std::numeric_limits<int32_t>::min();
208 int64_t ab_64 = a_64 * b_64;
210 int32_t nudge = is_positive_or_zero ? (1 << 30) : (1 - (1 << 30));
211 int32_t ab_x2_high32 =
static_cast<int32_t
>((ab_64 + nudge) / (1ll << 31));
212 return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
217 const int32_t mask = (1 << exponent) - 1;
218 const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
219 return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
224 const auto left_shift = shift > 0 ? shift : 0;
225 const auto right_shift = shift > 0 ? 0 : -shift;
235 else if(exponent < 0)
241 constexpr
auto min = std::numeric_limits<int32_t>::min();
242 constexpr
auto max = std::numeric_limits<int32_t>::max();
243 const auto width =
sizeof(int32_t) * 8;
245 const int32_t threshold = ((1 << (width - 1 - exponent)) - 1);
246 bool pos_mask = v > threshold;
247 bool neg_mask = v < -threshold;
248 int32_t result = v << exponent;
249 result = pos_mask ? max : result;
250 result = neg_mask ? min : result;
262 output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
269 while(input >= (1 << 29))
275 const uint32_t max_left_shift_bits = __builtin_clz(static_cast<uint32_t>(input)) - 1;
276 const uint32_t max_left_shift_bits_pairs = max_left_shift_bits / 2;
277 const uint32_t left_shift_bit_pairs = max_left_shift_bits_pairs - 1;
278 output_shift -= left_shift_bit_pairs;
279 input <<= 2 * left_shift_bit_pairs;
282 using FixedPointRawType = int32_t;
283 constexpr uint32_t fixedpoint_position = 3;
284 constexpr uint32_t fixedpoint_int_position =
sizeof(FixedPointRawType) * 8 - 1 - fixedpoint_position;
285 using FixedPoint3 = FixedPointRawType;
286 using FixedPoint0 = FixedPointRawType;
289 const FixedPoint3 fixedpoint_input = (input >> 1);
291 const FixedPoint3 fixedpoint_half_three = (0x1 << fixedpoint_int_position) + (0x1 << (fixedpoint_int_position - 1));
294 FixedPoint3 x = 0x1 << fixedpoint_int_position;
297 auto fixed_point_mul = [](FixedPointRawType a, FixedPointRawType
b) -> FixedPointRawType
303 auto fixed_point_rescale = [](FixedPointRawType a, uint32_t src_bit, uint32_t dst_bit) -> FixedPointRawType
305 const uint32_t exponent = src_bit - dst_bit;
310 constexpr int32_t num_iteration = 5;
311 for(int32_t i = 0; i < num_iteration; ++i)
313 const auto x3 = fixed_point_rescale(fixed_point_mul(fixed_point_mul(x, x), x), 9, fixedpoint_position);
314 x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), 6, fixedpoint_position);
318 const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
319 x = fixed_point_mul(fixedpoint_half_sqrt_2, x);
323 output_inv_sqrt <<= -output_shift;
327 output_shift *= reverse_shift;
int32_t gemmlowp_multiplier
GEMMLowp output stage multiplier used for quantizing to QASYMM8.
quantized, symmetric fixed-point 16-bit number
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Store the tensor's metadata.
quantized, asymmetric fixed-point 16-bit number
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift)
Calculate quantized representation of multiplier having value greater than one.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Copyright (c) 2017-2021 Arm Limited.
std::vector< int32_t > gemmlowp_shifts
GEMMLowp output stage shifts used for quantizing to QASYMM8.
Quantization information.
std::pair< int, int > get_min_max_values_from_quantized_data_type(DataType data_type)
Get minimum and maximum values for the input quantized data type.
void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift)
Compute quantized multiplier and shift for the inverse square root of input.
int32_t saturating_rounding_doubling_highmul(int32_t a, int32_t b)
Compute the rounded high 32 bits of twice the product of two 32-bit integers, saturating to the int32_t maximum on overflow.
Status calculate_quantized_multipliers(const QuantizationInfo &iq_info, const QuantizationInfo &wq_info, const QuantizationInfo &oq_info, GEMMLowpOutputStageInfo &stage_info)
Calculate quantized representation of per-channel multipliers.
int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int32_t shift)
Compute the value multiplied by given quantized multiplier and shift.
quantized, asymmetric fixed-point 8-bit number unsigned
std::vector< int32_t > gemmlowp_multipliers
GEMMLowp output stage multiplier used for quantizing to QASYMM8.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
GEMMLowp output stage info.
const std::vector< float > & scale() const
Scale vector accessor.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
quantized, symmetric fixed-point 8-bit number
int32_t rounding_divide_by_pow2(int32_t x, int exponent)
Round to the nearest division by a power-of-two using exponent, copied from NEMath.
int32_t gemmlowp_shift
GEMMLowp output stage shift used for quantizing to uint8.
void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr)
Compute quantized per-channel multipliers and shifts.
constexpr int64_t fixed_point_one_Q0
T round(T value)
Round floating-point value with half value rounding away from zero.
int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v)
Compute the value multiplied by the power-of-two.
quantized, asymmetric fixed-point 8-bit number signed
Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *quant_multiplier, int32_t *right_shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier with value less than one.
DataType
Available data types.