34 namespace quantization
54 int32_t *quant_multiplier,
58 const float internal_epsilon = ignore_epsilon ? 0.0f :
epsilon;
64 if(std::fabs(0.0f - multiplier) < internal_epsilon)
66 *quant_multiplier = 0;
72 const double q = std::frexp(multiplier, &shift_exp);
73 *right_shift = -1 * shift_exp;
82 if(ignore_epsilon && *right_shift > 31)
90 *quant_multiplier = static_cast<int32_t>(q_fixed);
96 int32_t *quantized_multiplier,
104 const double q = std::frexp(multiplier, &shift_exp);
105 *left_shift = shift_exp;
115 *quantized_multiplier = static_cast<int32_t>(q_fixed);
129 const unsigned int size = wq_info.
scale().size();
133 quant_multipliers.resize(size);
134 quant_shifts.resize(size);
136 const auto &w_scales = wq_info.
scale();
137 const float i_scale = iq_info.
scale().at(0);
138 const float o_scale = oq_info.
scale().at(0);
140 for(
unsigned int i = 0; i < size; ++i)
142 const float multiplier = i_scale * w_scales[i] / o_scale;
143 int32_t quant_multiplier = 0;
144 int32_t quant_shift = 0;
146 quant_multipliers[i] = quant_multiplier;
147 quant_shifts[i] = quant_shift;
159 int min_quant_val = 0;
160 int max_quant_val = 0;
164 min_quant_val = std::numeric_limits<uint8_t>::min();
165 max_quant_val = std::numeric_limits<uint8_t>::max();
169 min_quant_val = std::numeric_limits<int8_t>::min();
170 max_quant_val = std::numeric_limits<int8_t>::max();
173 min_quant_val = std::numeric_limits<uint16_t>::min();
174 max_quant_val = std::numeric_limits<uint16_t>::max();
177 min_quant_val = std::numeric_limits<int16_t>::min();
178 max_quant_val = std::numeric_limits<int16_t>::max();
183 return std::make_pair(min_quant_val, max_quant_val);
188 unsigned int idx_ofms,
189 int32_t *output_multipliers_ptr,
190 int32_t *output_shifts_ptr)
198 for(
unsigned int i = 0; i < num_filters; ++i)
200 int32_t output_multiplier = 0;
201 int32_t output_shift = 0;
202 const float multiplier = iq_info.
scale * wq_info.
scale()[i] / oq_info.
scale;
205 output_multipliers_ptr[i] = output_multiplier;
206 output_shifts_ptr[i] = output_shift;
212 bool overflow = a ==
b && a == std::numeric_limits<int32_t>::min();
215 int64_t ab_64 = a_64 * b_64;
216 bool is_positive_or_zero = a == 0 ||
b == 0 || (std::signbit(a) == std::signbit(
b));
217 int32_t nudge = is_positive_or_zero ? (1 << 30) : (1 - (1 << 30));
218 int32_t ab_x2_high32 = static_cast<int32_t>((ab_64 + nudge) / (1ll << 31));
219 return overflow ? std::numeric_limits<int32_t>::max() : ab_x2_high32;
224 const int32_t mask = (1 << exponent) - 1;
225 const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
226 return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
231 const auto left_shift = shift > 0 ? shift : 0;
232 const auto right_shift = shift > 0 ? 0 : -shift;
242 else if(exponent < 0)
248 constexpr
auto min = std::numeric_limits<int32_t>::min();
249 constexpr
auto max = std::numeric_limits<int32_t>::max();
250 const auto width =
sizeof(int32_t) * 8;
252 const int32_t threshold = ((1 << (width - 1 - exponent)) - 1);
253 bool pos_mask = v > threshold;
254 bool neg_mask = v < -threshold;
255 int32_t result = v << exponent;
256 result = pos_mask ? max : result;
257 result = neg_mask ? min : result;
269 output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
276 while(
input >= (1 << 29))
282 const uint32_t max_left_shift_bits = __builtin_clz(static_cast<uint32_t>(
input)) - 1;
283 const uint32_t max_left_shift_bits_pairs = max_left_shift_bits / 2;
284 const uint32_t left_shift_bit_pairs = max_left_shift_bits_pairs - 1;
285 output_shift -= left_shift_bit_pairs;
286 input <<= 2 * left_shift_bit_pairs;
289 using FixedPointRawType = int32_t;
290 constexpr uint32_t fixedpoint_position = 3;
291 constexpr uint32_t fixedpoint_int_position =
sizeof(FixedPointRawType) * 8 - 1 - fixedpoint_position;
292 using FixedPoint3 = FixedPointRawType;
293 using FixedPoint0 = FixedPointRawType;
296 const FixedPoint3 fixedpoint_input = (
input >> 1);
298 const FixedPoint3 fixedpoint_half_three = (0x1 << fixedpoint_int_position) + (0x1 << (fixedpoint_int_position - 1));
301 FixedPoint3 x = 0x1 << fixedpoint_int_position;
304 auto fixed_point_mul = [](FixedPointRawType a, FixedPointRawType
b) -> FixedPointRawType
310 auto fixed_point_rescale = [](FixedPointRawType a, uint32_t src_bit, uint32_t dst_bit) -> FixedPointRawType
312 const uint32_t exponent = src_bit - dst_bit;
317 constexpr int32_t num_iteration = 5;
318 for(int32_t i = 0; i < num_iteration; ++i)
320 const auto x3 = fixed_point_rescale(fixed_point_mul(fixed_point_mul(x, x), x), 9, fixedpoint_position);
321 x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), 6, fixedpoint_position);
325 const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
326 x = fixed_point_mul(fixedpoint_half_sqrt_2, x);
330 output_inv_sqrt <<= -output_shift;
334 output_shift *= reverse_shift;
int32_t gemmlowp_multiplier
GEMMLowp output stage multiplier used for quantizing to QASYMM8.
quantized, symmetric fixed-point 16-bit number
void compute_quantized_multipliers_and_shifts(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, unsigned int idx_ofms, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr)
Compute quantized per-channel multipliers and shifts.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Store the tensor's metadata.
quantized, asymmetric fixed-point 16-bit number
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift)
Calculate quantized representation of multiplier having value greater than one.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Copyright (c) 2017-2021 Arm Limited.
std::vector< int32_t > gemmlowp_shifts
GEMMLowp output stage shifts used for quantizing to QASYMM8.
Quantization information.
std::pair< int, int > get_min_max_values_from_quantized_data_type(DataType data_type)
Get minimum and maximum values for the input quantized data type.
void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift)
Compute quantized multiplier and shift for the inverse square root of input.
int32_t saturating_rounding_doubling_highmul(int32_t a, int32_t b)
Compute the rounded high 32 bits of the doubled product of two integers, saturating to the maximum int32 value on overflow.
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
Status calculate_quantized_multipliers(const QuantizationInfo &iq_info, const QuantizationInfo &wq_info, const QuantizationInfo &oq_info, GEMMLowpOutputStageInfo &stage_info)
Calculate quantized representation of per-channel multipliers.
int32_t multiply_by_quantized_multiplier(int32_t input, int32_t qmul, int32_t shift)
Compute the value multiplied by given quantized multiplier and shift.
quantized, asymmetric fixed-point 8-bit number unsigned
std::vector< int32_t > gemmlowp_multipliers
GEMMLowp output stage multiplier used for quantizing to QASYMM8.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
GEMMLowp output stage info.
const std::vector< float > & scale() const
Scale vector accessor.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
quantized, symmetric fixed-point 8-bit number
int32_t rounding_divide_by_pow2(int32_t x, int exponent)
Round to the nearest division by a power-of-two using exponent, copied from NEMath.
int32_t gemmlowp_shift
GEMMLowp output stage shift used for quantizing to uint8.
constexpr int64_t fixed_point_one_Q0
T round(T value)
Round floating-point value with half value rounding away from zero.
int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v)
Compute the value multiplied by the power-of-two.
quantized, asymmetric fixed-point 8-bit number signed
Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *quant_multiplier, int32_t *right_shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier with value less than one.
DataType
Available data types.