24 #ifndef SRC_CORE_NEON_KERNELS_SCALE_LIST_H 25 #define SRC_CORE_NEON_KERNELS_SCALE_LIST_H 40 #define DECLARE_SCALE_KERNEL(func_name) \ 41 void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \ 42 InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \ 43 bool align_corners, const Window &window) 50 #undef DECLARE_SCALE_KERNEL 54 bool align_corners,
const Window &window)
69 const int step_cout = 16 /
sizeof(T);
71 Window window_execution = window;
79 const int xo_start = window_execution.
y().
start();
80 const int xo_end = window_execution.
y().
end();
81 const int xo_step = window_execution.
y().
step();
82 const int yo_start = window_execution.
z().
start();
83 const int yo_end = window_execution.
z().
end();
84 const int yo_step = window_execution.
z().
step();
85 const int bo_start = window_execution[3].start();
86 const int bo_end = window_execution[3].end();
87 const int bo_step = window_execution[3].step();
89 for(
int bo = bo_start; bo < bo_end; bo += bo_step)
91 const uint8_t *in_ptr_base = in.
ptr() + bo * in_stride_w;
92 uint8_t *out_ptr_base = out.
ptr() + bo * out_stride_w;
94 for(
int yo = yo_start; yo < yo_end; yo += yo_step)
97 float yi_f = ((yo + sampling_offset) * scale_y);
105 yi =
static_cast<int>(std::floor(yi_f));
108 for(
int xo = xo_start; xo < xo_end; xo += xo_step)
111 float xi_f = ((xo + sampling_offset) * scale_x);
119 xi =
static_cast<int>(std::floor(xi_f));
122 const uint8_t *in_ptr = in_ptr_base + xi * in_stride_y + yi * in_stride_z;
123 uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
126 for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
128 auto out0 =
wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout *
sizeof(T)));
129 wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout *
sizeof(T)), out0);
132 for(; cout < out_dim_ch; ++cout)
134 auto out0 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T)));
135 *(
reinterpret_cast<T *
>(out_ptr + cout *
sizeof(T))) = out0;
142 template <
typename T>
145 bool align_corners,
const Window &window)
165 const int step_cout = 16 /
sizeof(T);
167 Window window_execution = window;
169 Window win_in_out(window);
175 const int xo_start = window_execution.
y().
start();
176 const int xo_end = window_execution.
y().
end();
177 const int xo_step = window_execution.
y().
step();
178 const int yo_start = window_execution.
z().
start();
179 const int yo_end = window_execution.
z().
end();
180 const int yo_step = window_execution.
z().
step();
181 const int bo_start = window_execution[3].start();
182 const int bo_end = window_execution[3].end();
183 const int bo_step = window_execution[3].step();
187 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 188 using ConstType =
typename std::conditional<std::is_same<T, float16_t>::value,
half, T>
::type;
192 const T const_border_value =
static_cast<T
>(constant_border_value.
get<ConstType>());
194 for(
int bo = bo_start; bo < bo_end; bo += bo_step)
196 const uint8_t *in_ptr_base = in.
ptr() + bo * in_stride_w;
197 uint8_t *out_ptr_base = out.
ptr() + bo * out_stride_w;
199 for(
int yo = yo_start; yo < yo_end; yo += yo_step)
202 const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
204 const auto yi =
static_cast<int>(std::floor(yi_f));
206 const auto a1 = (yi_f -
static_cast<float>(yi));
207 const auto b1 = (1.f - a1);
209 for(
int xo = xo_start; xo < xo_end; xo += xo_step)
212 const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
214 const auto xi =
static_cast<int>(std::floor(xi_f));
216 const auto a = (xi_f -
static_cast<float>(xi));
217 const auto b = (1.f - a);
219 const auto s00_s =
static_cast<T
>(
b * b1);
220 const auto s01_s =
static_cast<T
>(a * b1);
221 const auto s10_s =
static_cast<T
>(
b * a1);
222 const auto s11_s =
static_cast<T
>(a * a1);
224 const uint8_t *in_ptr = in_ptr_base + xi * in_stride_y + yi * in_stride_z;
225 uint8_t *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
228 for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
230 auto in00 =
wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
231 auto in01 =
wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
232 auto in10 =
wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
233 auto in11 =
wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
234 if((yi >= 0) && (yi < in_dim_h))
236 if((xi >= 0) && (xi < in_dim_w))
238 in00 =
wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout *
sizeof(T)));
240 if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
242 in01 =
wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout *
sizeof(T) + in_stride_y));
245 if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
247 if((xi >= 0) && (xi < in_dim_w))
249 in10 =
wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout *
sizeof(T) + in_stride_z));
251 if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
253 in11 =
wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout *
sizeof(T) + in_stride_y + in_stride_z));
266 wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout *
sizeof(T)), out0);
269 for(; cout < out_dim_ch; ++cout)
271 auto in00 =
static_cast<T
>(const_border_value);
272 auto in01 =
static_cast<T
>(const_border_value);
273 auto in10 =
static_cast<T
>(const_border_value);
274 auto in11 =
static_cast<T
>(const_border_value);
275 if((yi >= 0) && (yi < in_dim_h))
277 if((xi >= 0) && (xi < in_dim_w))
279 in00 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T)));
281 if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
283 in01 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + in_stride_y));
286 if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
288 if((xi >= 0) && (xi < in_dim_w))
290 in10 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + in_stride_z));
292 if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
294 in11 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + in_stride_y + in_stride_z));
297 auto out0 =
static_cast<T
>(0);
298 out0 += in00 * s00_s;
299 out0 += in01 * s01_s;
300 out0 += in10 * s10_s;
301 out0 += in11 * s11_s;
302 *(
reinterpret_cast<T *
>(out_ptr + cout *
sizeof(T))) = out0;
310 for(
int bo = bo_start; bo < bo_end; bo += bo_step)
312 const uint8_t *in_ptr = in.
ptr() + bo * in_stride_w;
313 uint8_t *out_ptr = out.
ptr() + bo * out_stride_w;
315 for(
int yo = yo_start; yo < yo_end; yo += yo_step)
318 const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
320 const auto yi =
static_cast<int>(std::floor(yi_f));
322 const auto a1 = (yi_f -
static_cast<float>(yi));
323 const auto b1 = (1.f - a1);
325 const auto yi0 = utility::clamp<int>(yi, 0, in_dim_h - 1);
326 const auto yi1 = utility::clamp<int>(yi + 1, 0, in_dim_h - 1);
328 for(
int xo = xo_start; xo < xo_end; xo += xo_step)
331 const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
333 const auto xi =
static_cast<int>(std::floor(xi_f));
335 const auto a = (xi_f -
static_cast<float>(xi));
336 const auto b = (1.f - a);
338 const auto s00_s =
static_cast<T
>(
b * b1);
339 const auto s01_s =
static_cast<T
>(a * b1);
340 const auto s10_s =
static_cast<T
>(
b * a1);
341 const auto s11_s =
static_cast<T
>(a * a1);
343 const auto xi0 = utility::clamp<int>(xi, 0, in_dim_w - 1);
344 const auto xi1 = utility::clamp<int>(xi + 1, 0, in_dim_w - 1);
347 for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
353 in00 =
wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout *
sizeof(T) + (xi0) * in_stride_y + (yi0) * in_stride_z));
354 in01 =
wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout *
sizeof(T) + (xi1) * in_stride_y + (yi0) * in_stride_z));
355 in10 =
wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout *
sizeof(T) + (xi0) * in_stride_y + (yi1) * in_stride_z));
356 in11 =
wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout *
sizeof(T) + (xi1) * in_stride_y + (yi1) * in_stride_z));
367 wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout *
sizeof(T) + xo * out_stride_y + yo * out_stride_z), out0);
370 for(; cout < out_dim_ch; ++cout)
372 auto in00 =
static_cast<T
>(0);
373 auto in01 =
static_cast<T
>(0);
374 auto in10 =
static_cast<T
>(0);
375 auto in11 =
static_cast<T
>(0);
376 in00 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + (xi0) * in_stride_y + (yi0) * in_stride_z));
377 in01 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + (xi1) * in_stride_y + (yi0) * in_stride_z));
378 in10 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + (xi0) * in_stride_y + (yi1) * in_stride_z));
379 in11 = *(
reinterpret_cast<const T *
>(in_ptr + cout *
sizeof(T) + (xi1) * in_stride_y + (yi1) * in_stride_z));
380 auto out0 =
static_cast<T
>(0);
381 out0 += in00 * s00_s;
382 out0 += in01 * s01_s;
383 out0 += in10 * s10_s;
384 out0 += in11 * s11_s;
385 *(
reinterpret_cast<T *
>(out_ptr + cout *
sizeof(T) + xo * out_stride_y + yo * out_stride_z)) = out0;
397 template <
typename T>
400 bool align_corners,
const Window &window)
404 bilinear_neon_scale<T>(
src,
dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
408 nearest_neon_scale<T>(
src,
dst, offsets, sampling_offset, align_corners, window);
BorderMode
Methods available to handle borders.
PixelValue
Class describing the value of a pixel for any image format.
InterpolationPolicy
Interpolation method.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
constexpr int step() const
Return the step of the dimension.
void get(uint8_t &v) const
Interpret the pixel value as a U8.
uint8x16_t vloadq(const uint8_t *ptr)
half_float::half half
16-bit floating point type
Output values are defined by bilinear interpolation between the pixels.
Dimension
Describe one of the image's dimensions with a start, end and step.
void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset, bool align_corners, const Window &window)
constexpr const Dimension & z() const
Alias to access the third dimension of the window.
T round_half_away_from_zero(T value)
Round floating-point value with half value rounding away from zero.
Output values are defined to match the source pixel whose center is nearest to the sample position...
decltype(strategy::transforms) typedef type
ITensor
Interface for CPU tensor.
SimpleTensor< float > src
Copyright (c) 2017-2021 Arm Limited.
typename neon_bitvector< T, BW >::tag_type neon_bitvector_tag_t
Helper type template to get the tag type of a neon vector.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
#define DECLARE_SCALE_KERNEL(func_name)
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Pixels outside the image are assumed to have the same value as the closest image pixel.
constexpr const Dimension & y() const
Alias to access the second dimension of the window.
void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
void vstore(uint8_t *ptr, uint8x8_t val)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
float calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners=false)
Returns resize ratio between input and output with consideration of aligned corners.
Includes all wrapper headers at once.
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
constexpr int end() const
Return the end of the dimension.
Iterator updated by execute_window_loop for each window element.
constexpr int start() const
Return the start of the dimension.
void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, bool align_corners, const Window &window)
Window
Describe a multidimensional execution window.