24 #ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H
25 #define SRC_CORE_NEON_KERNELS_QUANTIZED_H
// NOTE(review): interior fragment of the NHWC quantized pooling kernel; the enclosing
// function signature and many interior lines are elided (embedded original line numbers
// jump). Comments below are hedged to what the visible lines establish.
// Execution-window extents along x; vector step is 16 lanes (8-bit data), half step 8.
52 const int window_start_x = window.
x().
start();
53 const int window_end_x = window.
x().
end();
54 const int window_step_x = 16;
55 const int window_half_step_x = window_step_x / 2;
57 Window window_out = window;
77 int pool_stride_x = 0;
78 int pool_stride_y = 0;
// Upper bounds include the right/bottom padding unless exclude_padding is set
// (NHWC: dimension(1)=width, dimension(2)=height — TODO confirm against full file).
80 const int upper_bound_w =
src->info()->dimension(1) + (pool_info.
exclude_padding ? 0 : pool_pad_right);
81 const int upper_bound_h =
src->info()->dimension(2) + (pool_info.
exclude_padding ? 0 : pool_pad_bottom);
// 0.5f used as rounding bias when converting averaged float results back to integers.
83 const float32x4_t half_scale_v = vdupq_n_f32(0.5f);
// Requantization constants: rescale factor and adjusted zero-point for the case
// where src and dst quantization infos differ.
87 const float quant_rescale = dst_qinfo.
scale / src_qinfo.
scale;
90 const int32_t new_offset =
91 dst_qinfo.
offset -
static_cast<int32_t
>(
static_cast<float>(src_qinfo.
offset) / quant_rescale);
93 const float requant_scale = dst_qinfo.
scale / src_qinfo.
scale;
94 const int32_t requant_offset =
95 dst_qinfo.
offset -
static_cast<int32_t
>(
static_cast<float>(src_qinfo.
offset) / requant_scale);
// Per-output-position pooling region, clamped to the valid source window.
102 const int idx_width =
id.y() * pool_stride_x;
103 const int idx_height =
id.z() * pool_stride_y;
104 const int pool_limit_y = pool_pad_top -
idx_height;
105 const int pool_limit_x = pool_pad_left -
idx_width;
107 const int pool_start_y = std::max(0, window_src.
z().
start() + pool_limit_y);
108 const int pool_end_y = std::min(pool_size_y, window_src.
z().
end() + pool_limit_y);
109 const int pool_start_x = std::max(0, window_src.
y().
start() + pool_limit_x);
110 const int pool_end_x = std::min(pool_size_x, window_src.
y().
end() + pool_limit_x);
// Main loop: full 16-lane vector steps across the channel axis.
112 int x_off = window_start_x;
113 for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
123 const float scale = calculate_avg_scale_pool2d(
125 upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
128 for (
int y = pool_start_y; y < pool_end_y; ++y)
130 for (
int x = pool_start_x; x < pool_end_x; ++x)
133 reinterpret_cast<const T *
>(
135 (x - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) +
136 (y - pool_pad_top) *
static_cast<int>(
src->info()->strides_in_bytes().z())) +
// If quantization infos differ, requantize the four 32-bit accumulators as floats.
148 if (src_qinfo != dst_qinfo)
150 const float32x4x4_t vres = {{
151 vcvtq_f32_q32(vres1),
152 vcvtq_f32_q32(vres2),
153 vcvtq_f32_q32(vres3),
154 vcvtq_f32_q32(vres4),
156 const auto requantized_dst =
157 vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale,
scale, new_offset);
// Same-qinfo path: average = accumulator * scale + 0.5 (round-half-up), per lane.
165 const float32x4_t scale_v = vdupq_n_f32(
scale);
167 vres1 = vcvtq_q32_f32<q32x4_t>(
wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
168 vres2 = vcvtq_q32_f32<q32x4_t>(
wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
169 vres3 = vcvtq_q32_f32<q32x4_t>(
wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
170 vres4 = vcvtq_q32_f32<q32x4_t>(
wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
// Second pooling pass over the same region (elided context — presumably the MAX path).
185 for (
int y = pool_start_y; y < pool_end_y; ++y)
187 for (
int x = pool_start_x; x < pool_end_x; ++x)
190 reinterpret_cast<const T *
>(
192 (x - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) +
193 (y - pool_pad_top) *
static_cast<int>(
src->info()->strides_in_bytes().z())) +
201 (src_qinfo != dst_qinfo)
// Half-vector (8-lane) tail loop.
210 for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
213 for (
int y = pool_start_y; y < pool_end_y; ++y)
215 for (
int x = pool_start_x; x < pool_end_x; ++x)
218 reinterpret_cast<const T *
>(
220 (x - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) +
221 (y - pool_pad_top) *
static_cast<int>(
src->info()->strides_in_bytes().z())) +
229 (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
// Scalar tail loop for remaining channels; average-pooling branch first.
234 for (; x_off < window_end_x; ++x_off)
238 q32_t res =
static_cast<q32_t
>(0.f);
241 const float scale = calculate_avg_scale_pool2d(
243 upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
246 for (
int y = pool_start_y; y < pool_end_y; ++y)
248 for (
int x = pool_start_x; x < pool_end_x; ++x)
251 *(
reinterpret_cast<const T *
>(
253 (x - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) +
254 (y - pool_pad_top) *
static_cast<int>(
src->info()->strides_in_bytes().z())) +
260 if (src_qinfo != dst_qinfo)
262 const float res_f =
static_cast<float>(res);
263 const float new_scale = quant_rescale /
scale;
267 *(
reinterpret_cast<T *
>(out.
ptr()) + x_off) = requantized_dst;
// Same-qinfo scalar average: round-half-up then narrow to T.
272 res =
static_cast<T
>(0.5f +
static_cast<float>(res) *
scale);
275 *(
reinterpret_cast<T *
>(out.
ptr()) + x_off) = res;
// Scalar max-pooling path: running max seeded with the type minimum.
280 T res = std::numeric_limits<T>::min();
282 for (
int y = pool_start_y; y < pool_end_y; ++y)
284 for (
int x = pool_start_x; x < pool_end_x; ++x)
287 *(
reinterpret_cast<const T *
>(
289 (x - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) +
290 (y - pool_pad_top) *
static_cast<int>(
src->info()->strides_in_bytes().z())) +
292 res = std::max(res, data);
// Max result: requantize if src/dst qinfos differ, otherwise store directly.
297 if (src_qinfo != dst_qinfo)
299 const float res_f =
static_cast<float>(res);
300 *(
reinterpret_cast<T *
>(out.
ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
304 *(
reinterpret_cast<T *
>(out.
ptr()) + x_off) = res;
312 #if defined(ENABLE_NCHW_KERNELS)
// NOTE(review): fragment — parameters and interior lines are elided. Visible code
// computes, per 8-lane q16 element, the reciprocal of the actual (clamped) pooling
// window area, i.e. the average-pooling scale with padding excluded at the borders.
313 template <
typename T,
typename TVec>
314 inline void scale_vector_q16x8(
bool exclude_padding,
320 const int upper_bound_w,
321 const int upper_bound_h,
// Top-left corner of the pooling window for this output position (may be negative
// before clamping when the window overlaps the padding region).
327 int start_x = (
id.x() + id_offset) * stride_x - pad_x;
328 int start_y =
id.y() * stride_y - pad_y;
329 const int end_y = std::min(start_y + pool_size, upper_bound_h);
332 start_y = std::max(0, start_y);
335 std::array<T, 8> elems = {{
// Per-lane: clamp the x-range, derive 1/area, then advance start_x by the stride.
346 for (
auto &el : elems)
348 int c_start_x = start_x;
349 const int end_x = std::min(c_start_x + pool_size, upper_bound_w);
352 c_start_x = std::max(0, c_start_x);
354 float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
356 start_x +=
step * stride_x;
// NOTE(review): fragment — return statement and per-element load are elided.
// Loads 16 consecutive elements starting at (x, y) in padded coordinates, substituting
// fval for any lane that falls outside the valid source region [pad..pad+src_dim).
369 template <
typename T>
370 auto load16_boundary_aware(
371 int srcw,
int srch,
int pad_l,
int pad_r,
int pad_t,
int pad_b,
int x,
int y,
const T *ptr, T fval)
// Whole-row validity is computed once; per-lane checks only cover the x axis.
376 const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
377 for (
int i = 0; i < 16; i++)
379 if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
// NOTE(review): fragment — the if/else selecting between the two layouts is elided.
// Writes up to 16 results from two 8-lane halves, clipping every store against dst_w.
// When the bool template flag is set, lower/upper lanes are interleaved at even/odd
// offsets; otherwise they are written as two contiguous 8-element runs.
391 template <
typename T,
typename V,
bool de
interleave>
392 inline void write16_boundary_aware(
int x,
int dst_w,
const V &lower,
const V &upper, T *ptr)
396 for (
int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i)
398 *(ptr + i * 2) = lower[i];
400 for (
int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i)
402 *(ptr + 1 + i * 2) = upper[i];
407 for (
int i = 0; i < 8 && (i + x) < dst_w; ++i)
409 *(ptr + i) = lower[i];
411 for (
int i = 0; i < 8 && (i + x + 8) < dst_w; ++i)
413 *(ptr + i + 8) = upper[i];
// NOTE(review): fragment — the loop body (the store itself) is elided.
// Writes up to 8 lanes of v to ptr, stopping at the destination width dst_w so that
// the rightmost partial vector never writes past the row end.
418 template <
typename T,
typename V>
419 inline void write8_boundary_aware(
int x,
int dst_w,
const V &v, T *ptr)
421 for (
int i = 0; i < 8 && (i + x) < dst_w; ++i)
// NOTE(review): fragment of the 2x2 NCHW quantized pooling kernel — many interior
// lines (vector loads, sums, the AVG/MAX branch structure, execute_window_loop call)
// are elided. Comments are limited to what the visible lines show.
427 template <
typename T>
428 void pooling2_quantized_neon_nchw(
const ITensor *
src,
431 PoolingLayerInfo &pool_info,
432 const Window &window_src,
433 const Window &window)
436 Iterator in(
src, window_src);
437 Iterator out(dst0, window);
442 using q16_t =
typename wrapper::traits::promote_t<T>;
447 constexpr
int pool_size = 2;
448 int pool_stride_x = 0;
449 int pool_stride_y = 0;
450 const int pool_pad_right = pool_info.pad_stride_info.pad_right();
451 const int pool_pad_top = pool_info.pad_stride_info.pad_top();
452 const int pool_pad_left = pool_info.pad_stride_info.pad_left();
453 const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
454 std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
455 const int upper_bound_w =
src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
456 const int upper_bound_h =
src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
// Base pointers for the two input rows of the 2x2 window, pre-offset by the padding.
457 const T *
const src_top_ptr =
reinterpret_cast<const T *
>(
458 src->ptr_to_element(Coordinates(-
static_cast<int>(pool_pad_left), -
static_cast<int>(pool_pad_top))));
459 const T *
const src_bottom_ptr =
reinterpret_cast<const T *
>(
460 src->ptr_to_element(Coordinates(-
static_cast<int>(pool_pad_left), -
static_cast<int>(pool_pad_top) + 1)));
461 const int scale_step_x = (pool_stride_x == 1) ? 2 : 1;
462 const UniformQuantizationInfo src_qinfo =
src->info()->quantization_info().uniform();
463 const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
464 const bool have_different_qinfo = src_qinfo != dst_qinfo;
// Requantization parameters used when src and dst quantization infos differ.
466 const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
467 const int32_t requant_offset =
468 dst_qinfo.offset -
static_cast<int32_t
>(
static_cast<float>(src_qinfo.offset) / requant_scale);
469 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
470 const int src_w =
src->info()->dimension(0);
471 const int src_h =
src->info()->dimension(1);
472 const int dst_w = dst0->info()->dimension(0);
// Out-of-bounds lanes are filled with the pooling identity: type-min for MAX, 0 for AVG.
474 const T fill_value = (pool_info.pool_type ==
PoolingType::MAX) ? std::numeric_limits<T>::min() : T(0);
478 [&](
const Coordinates &
id)
480 const auto x_val =
id.x() * pool_stride_x;
481 const auto y_val_0 =
id.y() * pool_stride_y;
482 const auto y_val_1 = (
id.y() * pool_stride_y) + 1;
// Boundary-aware 16-element loads of the top and bottom rows of the window.
485 load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
486 y_val_0,
reinterpret_cast<const T *
>(src_top_ptr + in.offset()), fill_value);
488 load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
489 y_val_1,
reinterpret_cast<const T *
>(src_bottom_ptr + in.offset()), fill_value);
491 q8x8_t lower_res = {};
492 q8x8_t upper_res = {};
496 const q16x8x2_t top_data_q16 = {
498 const q16x8x2_t bottom_data_q16 = {
502 const q16x8x2_t vrsum = {{
// Scale the accumulated sums by 1/area (boundary-aware) for the AVG path.
514 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower,
id, 0, scale_step_x, pool_size,
515 upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
516 pool_stride_x, pool_stride_y);
520 if (pool_stride_x == 1)
523 const q16x8x2_t vrsum_shifted = {
533 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper,
id, 1, 2, pool_size,
534 upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
535 pool_stride_x, pool_stride_y);
// MAX path: element-wise max of the two rows (elided horizontal reduction follows).
541 const q8x16_t max_data =
wrapper::vmax(top_data, bottom_data);
543 if (pool_stride_x == 1)
550 if (have_different_qinfo)
552 const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
556 auto out_ptr =
reinterpret_cast<T *
>(out.ptr());
// stride 1 produces 16 outputs (interleaved halves); stride 2 produces 8.
558 if (pool_stride_x == 1)
560 write16_boundary_aware<T, q8x8_t, true>(
id.x(), dst_w, lower_res, upper_res, out_ptr);
564 write8_boundary_aware<T, q8x8_t>(
id.x(), dst_w, lower_res, out_ptr);
// NOTE(review): fragment of the 3x3 NCHW quantized pooling kernel — interior lines
// (vector sums, shifts, stride-2 compaction body, execute_window_loop call) are elided.
570 template <
typename T>
571 void pooling3_quantized_neon_nchw(
const ITensor *
src,
574 PoolingLayerInfo &pool_info,
575 const Window &window_src,
576 const Window &window)
579 Iterator in(
src, window_src);
580 Iterator out(dst0, window);
585 using q8x8x2_t =
typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>
::type;
586 using q16_t =
typename wrapper::traits::promote_t<T>;
590 constexpr
int pool_size = 3;
591 const int pool_pad_right = pool_info.pad_stride_info.pad_right();
592 const int pool_pad_top = pool_info.pad_stride_info.pad_top();
593 const int pool_pad_left = pool_info.pad_stride_info.pad_left();
594 const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
595 int pool_stride_x = 0;
596 int pool_stride_y = 0;
597 std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
598 const int upper_bound_w =
src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
599 const int upper_bound_h =
src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
601 const UniformQuantizationInfo &src_qinfo =
src->info()->quantization_info().uniform();
602 const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
// Requantization parameters used when src and dst quantization infos differ.
604 const float requant_scale = dst_qinfo.scale / src_qinfo.scale;
605 const int32_t requant_offset =
606 dst_qinfo.offset -
static_cast<int32_t
>(
static_cast<float>(src_qinfo.offset) / requant_scale);
607 const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
// Base pointers for the three input rows of the 3x3 window, pre-offset by the padding.
609 const T *
const src_top_ptr =
reinterpret_cast<const T *
>(
610 src->ptr_to_element(Coordinates(-
static_cast<int>(pool_pad_left), -
static_cast<int>(pool_pad_top))));
611 const T *
const src_middle_ptr =
reinterpret_cast<const T *
>(
612 src->ptr_to_element(Coordinates(-
static_cast<int>(pool_pad_left), -
static_cast<int>(pool_pad_top) + 1)));
613 const T *
const src_bottom_ptr =
reinterpret_cast<const T *
>(
614 src->ptr_to_element(Coordinates(-
static_cast<int>(pool_pad_left), -
static_cast<int>(pool_pad_top) + 2)));
616 const int src_w =
src->info()->dimension(0);
617 const int src_h =
src->info()->dimension(1);
// Out-of-bounds lanes are filled with the pooling identity: 0 for AVG, type-min for MAX.
618 const T fill_value = (pool_info.pool_type ==
PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
619 const int dst_w = dst0->info()->dimension(0);
623 [&](
const Coordinates &
id)
625 const auto x_val =
id.x() * pool_stride_x;
626 const auto y_val_0 =
id.y() * pool_stride_y;
627 const auto y_val_1 = (
id.y() * pool_stride_y) + 1;
628 const auto y_val_2 = (
id.y() * pool_stride_y) + 2;
// Boundary-aware 16-element loads of the three rows of the window.
631 load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
632 y_val_0,
reinterpret_cast<const T *
>(src_top_ptr + in.offset()), fill_value);
634 load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
635 y_val_1,
reinterpret_cast<const T *
>(src_middle_ptr + in.offset()), fill_value);
637 load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
638 y_val_2,
reinterpret_cast<const T *
>(src_bottom_ptr + in.offset()), fill_value);
646 const q16x8x2_t top_data_q16 = {
648 const q16x8x2_t middle_data_q16 = {
650 const q16x8x2_t bottom_data_q16 = {
654 const q16x8x2_t vrsum = {{
658 const q16x8x2_t vrsum_shifted_1 = {
660 const q16x8x2_t vrsum_shifted_2 = {
663 q16x8x2_t final_sum = {{
667 if (pool_stride_x == 2)
// Boundary-aware 1/area scaling of the accumulated sums (AVG path).
676 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res,
id, 0, 1, pool_size,
677 upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
678 pool_stride_x, pool_stride_y);
684 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0],
id, 0, 1, pool_size,
685 upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
686 pool_stride_x, pool_stride_y);
688 scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1],
id, 8, 1, pool_size,
689 upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
690 pool_stride_x, pool_stride_y);
// stride 2: compact even-indexed lanes (table lookup of indices 0,2,...,14).
701 if (pool_stride_x == 2)
704 static const q8x8_t lookup_val = {0, 2, 4, 6, 8, 10, 12, 14};
714 if (pool_stride_x == 1)
716 if (src_qinfo != dst_qinfo)
721 write16_boundary_aware<T, q8x8_t, false>(
id.x(), dst_w,
wrapper::vgetlow(fqres),
726 if (src_qinfo != dst_qinfo)
728 fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
730 write8_boundary_aware<T, q8x8_t>(
id.x(), dst_w, fres,
reinterpret_cast<T *
>(out.ptr()));
// NOTE(review): fragment of the generic MxN NCHW quantized pooling kernel — the
// AVG/MAX branch structure, accumulation lines, and the execute_window_loop call are
// elided. This path is fully scalar (no vector loads visible).
736 template <
typename T>
737 void poolingMxN_quantized_neon_nchw(
const ITensor *
src,
740 PoolingLayerInfo &pool_info,
741 const Window &window_src,
742 const Window &window)
745 Iterator in(
src, window_src);
746 Iterator out(dst0, window);
// Promoted accumulator types: T -> q16 -> q32 to avoid overflow while summing.
749 using q16_t =
typename wrapper::traits::promote_t<T>;
750 using q32_t =
typename wrapper::traits::promote_t<q16_t>;
// Global pooling uses the whole spatial extent as the window size.
752 const int pool_size_x = pool_info.is_global_pooling ?
src->info()->tensor_shape().x() : pool_info.pool_size.width;
753 const int pool_size_y = pool_info.is_global_pooling ?
src->info()->tensor_shape().y() : pool_info.pool_size.height;
754 const int pool_pad_right = pool_info.pad_stride_info.pad_right();
755 const int pool_pad_top = pool_info.pad_stride_info.pad_top();
756 const int pool_pad_left = pool_info.pad_stride_info.pad_left();
757 const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
758 int pool_stride_x = 0;
759 int pool_stride_y = 0;
760 std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
761 const int upper_bound_w =
src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
762 const int upper_bound_h =
src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
764 const UniformQuantizationInfo &src_qinfo =
src->info()->quantization_info().uniform();
765 const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
766 const int src_w =
src->info()->dimension(0);
767 const int src_h =
src->info()->dimension(1);
// Out-of-bounds reads are replaced with the pooling identity: 0 for AVG, type-min for MAX.
768 const T fill_value = (pool_info.pool_type ==
PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
769 const int stridex_in_bytes =
static_cast<int>(
src->info()->strides_in_bytes().x());
770 const int stridey_in_bytes =
static_cast<int>(
src->info()->strides_in_bytes().y());
774 [&](
const Coordinates &
id)
776 T res = std::numeric_limits<T>::min();
// AVG branch: boundary-aware 1/area scale for this output position.
783 const float scale = calculate_avg_scale_pool2d(
784 pool_info.exclude_padding,
DataLayout::NCHW,
id, pool_size_x, pool_size_y, upper_bound_w,
785 upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
788 for (
int y = 0; y < pool_size_y; ++y)
790 for (
int x = 0; x < pool_size_x; ++x)
792 const auto in_ptr =
reinterpret_cast<const T *
>(
793 in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
// Explicit bounds check per element; padded positions contribute fill_value.
795 const int idx = x +
id.x() * pool_stride_x - pool_pad_left;
796 const int idy = y +
id.y() * pool_stride_y - pool_pad_top;
797 const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
// MAX branch: same traversal, keeping the running maximum.
806 for (
int y = 0; y < pool_size_y; ++y)
808 for (
int x = 0; x < pool_size_x; ++x)
810 const auto in_ptr =
reinterpret_cast<const T *
>(
811 in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
813 const int idx = x +
id.x() * pool_stride_x - pool_pad_left;
814 const int idy = y +
id.y() * pool_stride_y - pool_pad_top;
815 const T data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
816 res = std::max(res, data);
// Store the scalar result for this output coordinate.
824 *(
reinterpret_cast<T *
>(out.ptr())) = res;
832 #endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H