#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H
#define SRC_CORE_NEON_KERNELS_QUANTIZED_H

#include <arm_neon.h>
// ... Arm Compute Library type and wrapper includes ...

// Scalar quantization helper: SFINAE picks the signed or unsigned 8-bit
// asymmetric scheme according to the requested output type.
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type quantize(float val, const UniformQuantizationInfo &info)
{
    return quantize_qasymm8_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type quantize(float val, const UniformQuantizationInfo &info)
{
    return quantize_qasymm8(val, info);
}

// float32 <-> 32-bit integer vector conversions, specialised on signedness.
template <typename T>
inline T vcvtq_q32_f32(float32x4_t values);

template <>
inline uint32x4_t vcvtq_q32_f32(float32x4_t values)
{
    return vcvtq_u32_f32(values);
}

template <>
inline int32x4_t vcvtq_q32_f32(float32x4_t values)
{
    return vcvtq_s32_f32(values);
}

template <typename T>
inline float32x4_t vcvtq_f32_q32(T values);

template <>
inline float32x4_t vcvtq_f32_q32(uint32x4_t values)
{
    return vcvtq_f32_u32(values);
}

template <>
inline float32x4_t vcvtq_f32_q32(int32x4_t values)
{
    return vcvtq_f32_s32(values);
}
// Requantize a 16-wide float accumulator, folding the pooling average
// (scale_pooling) into the requantization scale so that averaging and the
// quantized-domain change happen in a single vquantize pass.
template <typename Tout>
inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);

template <>
inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
{
    const float new_scale = quant_rescale / scale_pooling;
    return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
}

template <>
inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
{
    const float new_scale = quant_rescale / scale_pooling;
    return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
}
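// Worked example (illustrative numbers, not from the original source): with
// src scale 0.5, dst scale 1.0 and a 2x2 average (scale_pooling = 1/4),
// quant_rescale = 1.0 / 0.5 = 2.0 and new_scale = 2.0 / 0.25 = 8.0.
// Quantizing the raw 32-bit sums with new_scale therefore divides by the
// pool area (4) and applies the src->dst rescale (x 0.5) in one step,
// avoiding an intermediate rounding between the two operations.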
// Requantize two 8-lane vectors into one 16-wide result: widen the 8-bit
// lanes to 32 bits, convert to float, then quantize with the new qinfo.
template <typename Tin, typename Tout>
inline Tout vrequantize_pooling(Tin vec1, Tin vec2, const UniformQuantizationInfo &requant_qinfo);

template <>
inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x4_t acc =
    {
        {
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
        }
    };
    return vquantize(acc, requant_qinfo);
}

template <>
inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x4_t acc =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
        }
    };
    return vquantize_signed(acc, requant_qinfo);
}
// Single-vector variant: requantize 8 lanes instead of a 16-wide pair.
template <typename T>
inline T vrequantize_pooling(T vec, const UniformQuantizationInfo &requant_qinfo);

template <>
inline uint8x8_t vrequantize_pooling(uint8x8_t vec, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x2_t acc =
    {
        {
            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
        }
    };
    return vquantize(acc, requant_qinfo);
}

template <>
inline int8x8_t vrequantize_pooling(int8x8_t vec, const UniformQuantizationInfo &requant_qinfo)
{
    const float32x4x2_t acc =
    {
        {
            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
        }
    };
    return vquantize_signed(acc, requant_qinfo);
}
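// Hedged usage sketch (variable names illustrative): kernels below call the
// 8-lane variant when storing leftover vectors, e.g.
//
//   q8x8_t vres = /* pooled values, still in the src quantized domain */;
//   if(src_qinfo != dst_qinfo)
//   {
//       vres = vrequantize_pooling<q8x8_t>(vres, requant_qinfo);
//   }
//   wrapper::vstore(out_ptr, vres);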
inline float calculate_avg_scale(bool exclude_padding, DataLayout data_layout, const Coordinates &id,
                                 const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                                 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    const unsigned int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);

    int start_x = id[idx_width] * stride_x - pad_x;
    int start_y = id[idx_height] * stride_y - pad_y;

    const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
    const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
    if(exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return 1.f / ((end_y - start_y) * (end_x - start_x));
}
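// Worked example (illustrative): a 3x3 pool, stride 1, pad 1, at the first
// output position gives start_x = start_y = -1 and end_x = end_y = 2. With
// exclude_padding the starts clamp to 0, so only the 2x2 in-bounds region
// counts and the function returns 1/4; without it the full 3x3 window is
// used and the result is 1/9.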
template <typename T>
void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    const int window_start_x     = window.x().start();
    const int window_end_x       = window.x().end();
    const int window_step_x      = 16;
    const int window_half_step_x = window_step_x / 2;

    Window window_out = window;
    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));

    // ... NEON type aliases (q8x8_t, q8x16_t, q16x8_t, q32_t, q32x4_t),
    //     pool sizes, padding values and the in/out iterators are set up here ...
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();

    const float32x4_t half_scale_v = vdupq_n_f32(0.5f);

    const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();

    const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
    // "new_offset" doesn't have to consider the "half_scale_v" in its computation:
    // with requantization performed in a single step there are no intermediate roundings.
    const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);

    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
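    // Worked example (illustrative): for src (scale 0.25, offset 5) and
    // dst (scale 0.5, offset 10), requant_scale = 0.5 / 0.25 = 2.0 and
    // requant_offset = 10 - 5 / 2 = 8, so a raw src value q maps to roughly
    // q / 2 + 8, i.e. the same real value re-expressed on the dst grid (up
    // to the truncation absorbed by the 0.5 rounding bias above).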
    execute_window_loop(window_out, [&](const Coordinates & id)
    {
        const int idx_width    = id.y() * pool_stride_x;
        const int idx_height   = id.z() * pool_stride_y;
        const int pool_limit_y = pool_pad_top - idx_height;
        const int pool_limit_x = pool_pad_left - idx_width;

        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
        const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
        const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
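        // Example (illustrative): with pool_pad_left = 1 the first output
        // column has idx_width = 0, so pool_limit_x = 1 and pool_start_x
        // clamps to 1: the kernel simply skips the pool element that would
        // fall into the left padding instead of reading out of bounds.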
        int x_off = window_start_x;
        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // 16-wide vector loop
        {
            if(pool_info.pool_type != PoolingType::MAX)
            {
                q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0), wrapper::traits::vector_128_tag{});
                q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0), wrapper::traits::vector_128_tag{});
                q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0), wrapper::traits::vector_128_tag{});
                q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0), wrapper::traits::vector_128_tag{});

                // Calculate scale
                const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y,
                                                        upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

                // Perform pooling
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
                                                                                         (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        // ... widen data to 16 bits and accumulate into vres1..vres4 ...
                    }
                }

                if(src_qinfo != dst_qinfo)
                {
                    const float32x4x4_t vres = { { vcvtq_f32_q32(vres1), vcvtq_f32_q32(vres2), vcvtq_f32_q32(vres3), vcvtq_f32_q32(vres4) } };
                    const auto          requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
                    // ... store requantized_dst with wrapper::vstore ...
                }
                else
                {
                    const float32x4_t scale_v = vdupq_n_f32(scale);
                    // ... multiply by scale_v, add half_scale_v to round to nearest, narrow and store ...
                }
            }
            else // Max pooling
            {
                q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});

                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
                                                                                         (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        vres = wrapper::vmax(vres, data);
                    }
                }

                // ... requantize vres if the quantization infos differ, then store ...
            }
        }

        if(pool_info.pool_type == PoolingType::MAX)
        {
            // 8-wide half-step loop over any remaining full q8x8_t vectors
            for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
            {
                q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});

                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
                                                                                       (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        vres = wrapper::vmax(vres, data);
                    }
                }

                // Store result, requantizing on the fly if needed
                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
                                (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
            }
        }
        // Scalar left-overs loop
        for(; x_off < window_end_x; ++x_off)
        {
            if(pool_info.pool_type != PoolingType::MAX)
            {
                q32_t res = static_cast<q32_t>(0.f);

                // Calculate scale
                const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y,
                                                        upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

                // Perform pooling
                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
                                                                     (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        res += data;
                    }
                }

                if(src_qinfo != dst_qinfo)
                {
                    const float res_f           = static_cast<float>(res);
                    const float new_scale       = quant_rescale / scale;
                    const auto  requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));

                    // Store result
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
                }
                else
                {
                    // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
                    res = static_cast<T>(0.5f + static_cast<float>(res) * scale);

                    // Store result
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
                }
            }
            else // Max pooling
            {
                T res = std::numeric_limits<T>::min();

                for(int y = pool_start_y; y < pool_end_y; ++y)
                {
                    for(int x = pool_start_x; x < pool_end_x; ++x)
                    {
                        const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
                                                                     (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) + x_off);
                        res = std::max(res, data);
                    }
                }

                if(src_qinfo != dst_qinfo)
                {
                    const float res_f = static_cast<float>(res);
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
                }
                else
                {
                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
                }
            }
        }
    },
    in, out);
}
#if defined(ENABLE_NCHW_KERNELS)
// Scale eight 16-bit pooled sums in place, giving each lane the 1/area
// factor of the pooling window it came from (windows clipped by padding
// cover a smaller area when exclude_padding is set).
template <typename T, typename TVec>
inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
                               const int pool_size, const int upper_bound_w, const int upper_bound_h,
                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    int       start_x = (id.x() + id_offset) * stride_x - pad_x;
    int       start_y = id.y() * stride_y - pad_y;
    const int end_y   = std::min(start_y + pool_size, upper_bound_h);
    if(exclude_padding)
    {
        start_y = std::max(0, start_y);
    }

    std::array<T, 8> elems =
    {
        {
            wrapper::vgetlane(v, 0), wrapper::vgetlane(v, 1), wrapper::vgetlane(v, 2), wrapper::vgetlane(v, 3),
            wrapper::vgetlane(v, 4), wrapper::vgetlane(v, 5), wrapper::vgetlane(v, 6), wrapper::vgetlane(v, 7),
        }
    };

    for(auto &el : elems)
    {
        int       c_start_x = start_x;
        const int end_x     = std::min(c_start_x + pool_size, upper_bound_w);
        if(exclude_padding)
        {
            c_start_x = std::max(0, c_start_x);
        }
        float scale = 1.f / ((end_y - start_y) * (end_x - c_start_x));
        el *= scale;
        start_x += step * stride_x;
    }

    // ... write the scaled lanes back into v with wrapper::vsetlane ...
}
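// Example (illustrative): lane i holds the sum for output column
// id.x() + id_offset + i * step. With pool_size = 2, stride_x = 1, pad_x = 1
// the first lane's window starts at x = -1 and is clipped to width 1 when
// exclude_padding is set, so lane 0 is scaled by 1/(1 * h) while interior
// lanes get 1/(2 * h), where h is the clipped window height.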
// Load 16 consecutive elements, substituting fval for any lane that falls
// outside the valid (unpadded) region of the source tensor.
template <typename T>
auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval)
{
    ARM_COMPUTE_UNUSED(pad_b, pad_r);

    T vec[16];
    // Handle reading a row that lies partly or fully outside the tensor
    const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
    for(int i = 0; i < 16; i++)
    {
        if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
        {
            vec[i] = *(ptr + i);
        }
        else
        {
            vec[i] = fval;
        }
    }
    return wrapper::vloadq(vec);
}
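// Example (illustrative): srcw = 10, pad_l = 2, x = 0 reads lanes 2..11 from
// memory and fills lanes 0-1 and 12-15 with fval, so the vector path never
// dereferences padding bytes that were not allocated.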
// Store up to 16 results without writing past the row end. When deinterleave
// is set, lower/upper hold the even and odd output columns (stride-1 fast
// path) and are interleaved back on store.
template <typename T, typename V, bool deinterleave>
inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr)
{
    if(deinterleave)
    {
        for(int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i)
        {
            *(ptr + i * 2) = lower[i];
        }
        for(int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i)
        {
            *(ptr + 1 + i * 2) = upper[i];
        }
    }
    else
    {
        for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
        {
            *(ptr + i) = lower[i];
        }
        for(int i = 0; i < 8 && (i + x + 8) < dst_w; ++i)
        {
            *(ptr + i + 8) = upper[i];
        }
    }
}
// 8-element variant of the boundary-aware store above.
template <typename T, typename V>
inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr)
{
    for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
    {
        *(ptr + i) = v[i];
    }
}
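// Example (illustrative): for dst_w = 13 and x = 8, write8_boundary_aware
// stores lanes 0..4 and drops lanes 5..7, clamping the tail of the row the
// same way the 16-element store clamps its even/odd columns.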
template <typename T>
void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    // ... NEON type aliases, iterators, shape/padding setup and fill_value ...

    constexpr int pool_size     = 2;
    int           pool_stride_x = 0;
    int           pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();

    const T *const src_top_ptr    = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
    const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
    const int      scale_step_x   = (pool_stride_x == 1) ? 2 : 1;

    const UniformQuantizationInfo src_qinfo            = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo            = dst0->info()->quantization_info().uniform();
    const bool                    have_different_qinfo = src_qinfo != dst_qinfo;

    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto x_val   = id.x() * pool_stride_x;
        const auto y_val_0 = id.y() * pool_stride_y;
        const auto y_val_1 = (id.y() * pool_stride_y) + 1;

        auto top_data    = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
        auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);

        q8x8_t lower_res = {};
        q8x8_t upper_res = {};

        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Add rows: widen both rows to 16 bits and sum them
            const q16x8x2_t vrsum =
            {
                {
                    wrapper::vadd(wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgetlow(bottom_data))),
                    wrapper::vadd(wrapper::vmovl(wrapper::vgethigh(top_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))),
                }
            };

            // Pair-wise add row data, then scale the lower result
            q16x8_t res_lower = wrapper::vcombine(wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0])),
                                                  wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1])));
            scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x,
                                               pool_size, upper_bound_w, upper_bound_h,
                                               pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
            lower_res = wrapper::vmovn(res_lower);

            // With stride 1 the shifted sums produce the odd output columns
            if(pool_stride_x == 1)
            {
                const q16x8x2_t vrsum_shifted =
                {
                    {
                        wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
                        wrapper::vext_1(vrsum.val[1], vrsum.val[1]),
                    }
                };
                q16x8_t res_upper = wrapper::vcombine(wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
                                                      wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                upper_res = wrapper::vmovn(res_upper);
            }
        }
        else // Max pooling
        {
            const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
            lower_res              = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
            if(pool_stride_x == 1)
            {
                const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
                upper_res                      = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
            }
        }

        if(have_different_qinfo)
        {
            const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
            lower_res                  = wrapper::vgetlow(requantized_dst);
            upper_res                  = wrapper::vgethigh(requantized_dst);
        }

        auto out_ptr = reinterpret_cast<T *>(out.ptr());
        if(pool_stride_x == 1)
        {
            write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr);
        }
        else
        {
            write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr);
        }
    },
    in, out);
}
template <typename T>
void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    // ... NEON type aliases, iterators, shape/padding setup and fill_value ...
    using q8x8x2_t = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;

    constexpr int pool_size     = 3;
    int           pool_stride_x = 0;
    int           pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();

    const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
    const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();

    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);

    const T *const src_top_ptr    = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
    const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
    const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto x_val   = id.x() * pool_stride_x;
        const auto y_val_0 = id.y() * pool_stride_y;
        const auto y_val_1 = (id.y() * pool_stride_y) + 1;
        const auto y_val_2 = (id.y() * pool_stride_y) + 2;

        auto top_data    = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
        auto middle_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value);
        auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
                                                 x_val, y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);

        q8x8_t  fres  = {};
        q8x16_t fqres = {};

        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Row sums of the 3x3 window, plus the two shifted copies that
            // cover the horizontally neighbouring windows
            const q16x8x2_t vrsum           = { /* ... top + middle + bottom, widened to 16 bits ... */ };
            const q16x8x2_t vrsum_shifted_1 = { /* ... vrsum shifted left by one element ... */ };
            const q16x8x2_t vrsum_shifted_2 = { /* ... vrsum shifted left by two elements ... */ };

            // Final horizontal sum of the three row sums
            q16x8x2_t final_sum = { /* ... vrsum + vrsum_shifted_1 + vrsum_shifted_2 ... */ };

            if(pool_stride_x == 2)
            {
                q16x8_t res = {};
                // ... gather every second lane of final_sum into res ...
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                fres = wrapper::vmovn(res);
            }
            else
            {
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1,
                                                   pool_size, upper_bound_w, upper_bound_h,
                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
            }
        }
        else // Max pooling
        {
            const q8x16_t max_data = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
            // ... fold in the two shifted copies of max_data ...
            if(pool_stride_x == 2)
            {
                // Select every second lane with a table lookup
                static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
                // ... fres = wrapper::vtbl(q8x8x2_t{low, high}, lookup_val) ...
            }
        }

        // Store result
        if(pool_stride_x == 1)
        {
            if(src_qinfo != dst_qinfo)
            {
                fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
            }
            write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr()));
        }
        else
        {
            if(src_qinfo != dst_qinfo)
            {
                fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
            }
            write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr()));
        }
    },
    in, out);
}
template <typename T>
void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    // ... iterators, pool sizes, padding, qinfo and fill_value setup ...

    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();

    const int stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x());
    const int stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y());

    execute_window_loop(window, [&](const Coordinates & id)
    {
        T res = std::numeric_limits<T>::min();

        if(pool_info.pool_type != PoolingType::MAX)
        {
            q32_t sres = 0;

            // Calculate scale
            const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y,
                                                    upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

            // Perform pooling
            for(int y = 0; y < pool_size_y; ++y)
            {
                for(int x = 0; x < pool_size_x; ++x)
                {
                    const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);

                    const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
                    const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
                    const T   data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
                    sres += data;
                }
            }
            // Divide by scale, rounding to nearest
            res = static_cast<T>(0.5f + static_cast<float>(sres) * scale);
        }
        else // Max pooling
        {
            for(int y = 0; y < pool_size_y; ++y)
            {
                for(int x = 0; x < pool_size_x; ++x)
                {
                    const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);

                    const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
                    const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
                    const T   data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
                    res            = std::max(res, data);
                }
            }
        }

        // ... requantize res if the src and dst quantization infos differ ...
        *(reinterpret_cast<T *>(out.ptr())) = res;
    },
    in, out);
}
#endif // defined(ENABLE_NCHW_KERNELS)

#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H