24 #ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H 25 #define SRC_CORE_NEON_KERNELS_QUANTIZED_H 40 inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>
::type 47 inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>
::type 59 return vcvtq_u32_f32(values);
65 return vcvtq_s32_f32(values);
74 return vcvtq_f32_u32(values);
80 return vcvtq_f32_s32(values);
83 template <
typename Tout>
84 inline Tout
vrequantize_pooling_with_scale(
const float32x4x4_t &acc,
const float quant_rescale,
const float scale_pooling,
const int32_t new_offset);
89 const float new_scale = quant_rescale / scale_pooling;
96 const float new_scale = quant_rescale / scale_pooling;
100 template <
typename Tin,
typename Tout>
106 const float32x4x4_t acc =
109 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
110 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
111 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
112 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
121 const float32x4x4_t acc =
124 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
125 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
126 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
127 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
133 template <
typename T>
139 const float32x4x2_t acc =
142 vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
143 vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
152 const float32x4x2_t acc =
155 vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
156 vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
163 const int pad_x,
const int pad_y,
const int stride_x,
const int stride_y)
168 int start_x =
id[
idx_width] * stride_x - pad_x;
169 int start_y =
id[
idx_height] * stride_y - pad_y;
171 const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
172 const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
175 start_x = std::max(0, start_x);
176 start_y = std::max(0, start_y);
178 return 1.f / ((end_y - start_y) * (end_x - start_x));
181 template <
typename T>
186 const int window_start_x = window.
x().
start();
187 const int window_end_x = window.
x().
end();
188 const int window_step_x = 16;
189 const int window_half_step_x = window_step_x / 2;
191 Window window_out = window;
212 int pool_stride_y = 0;
217 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
221 const float quant_rescale = dst_qinfo.
scale / src_qinfo.
scale;
224 const int32_t new_offset = dst_qinfo.
offset -
static_cast<int32_t
>(
static_cast<float>(src_qinfo.
offset) / quant_rescale);
226 const float requant_scale = dst_qinfo.
scale / src_qinfo.
scale;
227 const int32_t requant_offset = dst_qinfo.
offset -
static_cast<int32_t
>(
static_cast<float>(src_qinfo.
offset) / requant_scale);
233 const int idx_height =
id.z() * pool_stride_y;
234 const int pool_limit_y = pool_pad_top -
idx_height;
235 const int pool_limit_x = pool_pad_left -
idx_width;
237 const int pool_start_y = std::max(0, window_src.
z().
start() + pool_limit_y);
238 const int pool_end_y = std::min(pool_size_y, window_src.
z().
end() + pool_limit_y);
239 const int pool_start_x = std::max(0, window_src.
y().
start() + pool_limit_x);
240 const int pool_end_x = std::min(pool_size_x, window_src.
y().
end() + pool_limit_x);
242 int x_off = window_start_x;
243 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
253 const float scale =
calculate_avg_scale(pool_info.
exclude_padding,
DataLayout::NHWC,
id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
257 for(
int y = pool_start_y; y < pool_end_y; ++y)
259 for(
int x = pool_start_x; x < pool_end_x; ++x)
261 const q8x16_t data =
wrapper::vloadq(reinterpret_cast<const T *>(in.
ptr() + (x - pool_pad_left) * static_cast<int>(src->
info()->
strides_in_bytes().
y()) + (y - pool_pad_top) *
static_cast<int> 273 if(src_qinfo != dst_qinfo)
275 const float32x4x4_t vres =
284 const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale,
scale, new_offset);
291 const float32x4_t scale_v = vdupq_n_f32(scale);
309 for(
int y = pool_start_y; y < pool_end_y; ++y)
311 for(
int x = pool_start_x; x < pool_end_x; ++x)
313 const q8x16_t data =
wrapper::vloadq(reinterpret_cast<const T *>(in.
ptr() + (x - pool_pad_left) * static_cast<int>(src->
info()->
strides_in_bytes().
y()) + (y - pool_pad_top) *
static_cast<int> 328 for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
331 for(
int y = pool_start_y; y < pool_end_y; ++y)
333 for(
int x = pool_start_x; x < pool_end_x; ++x)
335 const q8x8_t data =
wrapper::vload(reinterpret_cast<const T *>(in.
ptr() + (x - pool_pad_left) * static_cast<int>(src->
info()->
strides_in_bytes().
y()) + (y - pool_pad_top) *
static_cast<int> 343 (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
348 for(; x_off < window_end_x; ++x_off)
352 q32_t res =
static_cast<q32_t
>(0.f);
355 const float scale =
calculate_avg_scale(pool_info.
exclude_padding,
DataLayout::NHWC,
id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
359 for(
int y = pool_start_y; y < pool_end_y; ++y)
361 for(
int x = pool_start_x; x < pool_end_x; ++x)
363 const T data = *(
reinterpret_cast<const T *
>(in.
ptr() + (x - pool_pad_left) * static_cast<int>(src->
info()->
strides_in_bytes().
y()) + (y - pool_pad_top) *
static_cast<int> 369 if(src_qinfo != dst_qinfo)
371 const float res_f =
static_cast<float>(res);
372 const float new_scale = quant_rescale /
scale;
376 *(
reinterpret_cast<T *
>(out.
ptr()) + x_off) = requantized_dst;
381 res =
static_cast<T
>(0.5f +
static_cast<float>(res) * scale);
384 *(
reinterpret_cast<T *
>(out.
ptr()) + x_off) = res;
389 T res = std::numeric_limits<T>::min();
391 for(
int y = pool_start_y; y < pool_end_y; ++y)
393 for(
int x = pool_start_x; x < pool_end_x; ++x)
395 const T data = *(
reinterpret_cast<const T *
>(in.
ptr() + (x - pool_pad_left) * static_cast<int>(src->
info()->
strides_in_bytes().
y()) + (y - pool_pad_top) *
static_cast<int> 397 res = std::max(res, data);
402 if(src_qinfo != dst_qinfo)
404 const float res_f =
static_cast<float>(res);
405 *(
reinterpret_cast<T *
>(out.
ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
409 *(
reinterpret_cast<T *
>(out.
ptr()) + x_off) = res;
418 #if defined(ENABLE_NCHW_KERNELS) 419 template <
typename T,
typename TVec>
420 inline void scale_vector_q16x8(
bool exclude_padding, TVec &v,
const Coordinates &
id,
int id_offset,
int step,
421 const int pool_size,
const int upper_bound_w,
const int upper_bound_h,
422 const int pad_x,
const int pad_y,
const int stride_x,
const int stride_y)
424 int start_x = (
id.x() + id_offset) * stride_x - pad_x;
425 int start_y =
id.y() * stride_y - pad_y;
426 const int end_y = std::min(start_y + pool_size, upper_bound_h);
429 start_y = std::max(0, start_y);
432 std::array<T, 8> elems =
446 for(
auto &el : elems)
448 int c_start_x = start_x;
449 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
452 c_start_x = std::max(0, c_start_x);
454 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
456 start_x += step * stride_x;
469 template <
typename T>
479 using q8x8x2_t =
typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>
::type;
485 constexpr
int pool_size = 2;
487 int pool_stride_y = 0;
496 const T *
const src_top_ptr =
reinterpret_cast<const T *
>(src->
ptr_to_element(
Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
497 const T *
const src_bottom_ptr =
reinterpret_cast<const T *
>(src->
ptr_to_element(
Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
499 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
503 const bool have_different_qinfo = src_qinfo != dst_qinfo;
505 const float requant_scale = dst_qinfo.
scale / src_qinfo.
scale;
506 const int32_t requant_offset = dst_qinfo.
offset -
static_cast<int32_t
>(
static_cast<float>(src_qinfo.
offset) / requant_scale);
513 q8x8_t lower_res = {};
514 q8x8_t upper_res = {};
522 const q16x8x2_t vrsum =
537 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.
exclude_padding, res_lower, id, 0, scale_step_x,
543 if(pool_stride_x == 1)
546 const q16x8x2_t vrsum_shifted =
560 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.
exclude_padding, res_upper, id, 1, 2,
568 const q8x16_t max_data =
wrapper::vmax(top_data, bottom_data);
570 if(pool_stride_x == 1)
577 if(have_different_qinfo)
579 const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
585 if(pool_stride_x == 1)
587 const q8x8x2_t res = { { lower_res, upper_res } };
598 template <
typename T>
608 using q8x8x2_t =
typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>
::type;
613 constexpr
int pool_size = 3;
619 int pool_stride_y = 0;
627 const float requant_scale = dst_qinfo.
scale / src_qinfo.
scale;
628 const int32_t requant_offset = dst_qinfo.
offset -
static_cast<int32_t
>(
static_cast<float>(src_qinfo.
offset) / requant_scale);
631 const T *
const src_top_ptr =
reinterpret_cast<const T *
>(src->
ptr_to_element(
Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
632 const T *
const src_middle_ptr =
reinterpret_cast<const T *
>(src->
ptr_to_element(
Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
633 const T *
const src_bottom_ptr =
reinterpret_cast<const T *
>(src->
ptr_to_element(
Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
651 const q16x8x2_t vrsum =
658 const q16x8x2_t vrsum_shifted_1 =
665 const q16x8x2_t vrsum_shifted_2 =
673 q16x8x2_t final_sum =
680 if(pool_stride_x == 2)
694 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.
exclude_padding, res, id, 0, 1,
702 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.
exclude_padding, final_sum.val[0], id, 0, 1,
706 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.
exclude_padding, final_sum.val[1], id, 8, 1,
719 if(pool_stride_x == 2)
722 static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
732 if(pool_stride_x == 1)
734 if(src_qinfo != dst_qinfo)
742 if(src_qinfo != dst_qinfo)
744 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
752 template <
typename T>
773 int pool_stride_y = 0;
783 T res = std::numeric_limits<T>::min();
791 const float scale =
calculate_avg_scale(pool_info.
exclude_padding,
DataLayout::NCHW,
id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
795 for(
int y = 0; y < pool_size_y; ++y)
798 for(; x <= (pool_size_x - 8); x += 8)
800 const q8x8_t data =
wrapper::vload(reinterpret_cast<const T *>(in.
ptr() + (x - pool_pad_left) * static_cast<int>(src->
info()->
strides_in_bytes().
x()) + (y - pool_pad_top) *
static_cast<int> 808 for(; x < pool_size_x; ++x)
810 T data = *(
reinterpret_cast<const T *
>(in.
ptr() + (x - pool_pad_left) * static_cast<int>(src->
info()->
strides_in_bytes().
x()) + (y - pool_pad_top) *
static_cast<int> 827 for(
int y = 0; y < pool_size_y; ++y)
830 for(; x <= (pool_size_x - 8); x += 8)
832 const q8x8_t data =
wrapper::vload(reinterpret_cast<const T *>(in.
ptr() + (x - pool_pad_left) * static_cast<int>(src->
info()->
strides_in_bytes().
x()) + (y - pool_pad_top) *
static_cast<int> 837 for(; x < pool_size_x; ++x)
839 const T data = *(
reinterpret_cast<const T *
>(in.
ptr() + (x - pool_pad_left) * static_cast<int>(src->
info()->
strides_in_bytes().
x()) + (y - pool_pad_top) *
static_cast<int> 841 res = std::max(res, data);
855 *(
reinterpret_cast<T *
>(out.
ptr())) = res;
863 #endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
uint32x2_t vmovn(const uint64x2_t &a)
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
uint8_t quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given an unsigned 8-bit asymmetric quantization scheme.
uint8x16_t vloadq(const uint8_t *ptr)
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
const DataLayout data_layout
uint8x8_t vext_2(uint8x8_t value_a, uint8x8_t value_b)
Describe one of the image's dimensions with a start, end and step.
unsigned int pad_top() const
Get the top padding.
constexpr const Dimension & z() const
Alias to access the third dimension of the window.
decltype(strategy::transforms) typedef type
Interface for Neon tensor.
SimpleTensor< float > src
Copyright (c) 2017-2021 Arm Limited.
size_t height
Height of the image region or rectangle.
float32x4_t vcvtq_f32_q32(T values)
typename promote< T >::type promote_t
Get promoted type.
T x() const
Alias to access the size of the first dimension.
uint8x8_t vpadd(const uint8x8_t &a, const uint8x8_t &b)
Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
uint8x8_t vext_1(uint8x8_t value_a, uint8x8_t value_b)
Create the appropriate Neon vector given its type and size in terms of elements.
T vcvtq_q32_f32(float32x4_t values)
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
std::enable_if< std::is_same< T, int8_t >::value, int8_t >::type quantize(float val, const UniformQuantizationInfo &info)
int8_t quantize_qasymm8_signed(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy=RoundingPolicy::TO_NEAREST_UP)
Quantize a value given a signed 8-bit asymmetric quantization scheme.
T z() const
Alias to access the size of the third dimension.
std::pair< unsigned int, unsigned int > stride() const
Get the stride.
Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo)
Pooling Layer Information struct.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
unsigned int pad_right() const
Get the right padding.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
uint8x8_t vgetlow(const uint8x16_t val)
void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
uint8x8_t vtbl(const uint8x8x2_t &a, const uint8x8_t &b)
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vsetlane(const uint8_t value, const uint8x8_t vector, const unsigned int lane)
uint16x8_t vaddl(const uint8x8_t &a, const uint8x8_t &b)
float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, const int pad_x, const int pad_y, const int stride_x, const int stride_y)
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
Num samples, channels, height, width.
uint8x8_t vgethigh(const uint8x16_t val)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
PadStrideInfo pad_stride_info
size_t width
Width of the image region or rectangle.
T round(T value)
Round floating-point value with half value rounding away from zero.
Num samples, height, width, channels.
constexpr const Dimension & y() const
Alias to access the second dimension of the window.
uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
Quantize a Neon vector holding 8 floating point values.
uint8x8_t vload(const uint8_t *ptr)
void vstore(uint8_t *ptr, uint8x8_t val)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each window element.
T y() const
Alias to access the size of the second dimension.
Includes all wrapper headers at once.
int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
Quantize a Neon vector holding 8 floating point values.
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
uint8x8_t vpmax(const uint8x8_t &a, const uint8x8_t &b)
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
constexpr size_t offset() const
Return the offset in bytes from the first element to the current position of the iterator.
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
constexpr int end() const
Return the end of the dimension.
unsigned int pad_bottom() const
Get the bottom padding.
Iterator updated by execute_window_loop for each window element.
uint16x8_t vmovl(const uint8x8_t &a)
unsigned int pad_left() const
Get the left padding.
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
DataLayout
[DataLayout enum definition]
constexpr int start() const
Return the start of the dimension.
Describe a multidimensional execution window.
constexpr const Dimension & x() const
Alias to access the first dimension of the window.