/** Compute per-lane byte offsets into the source image for nearest-neighbour remapping.
 *
 * Loads four consecutive x and y map coordinates, clamps each lane to
 * [-1, width] / [-1, height] (one texel outside the valid area is permitted;
 * the border region is expected to be readable), truncates to signed
 * integers and combines them into byte offsets: offset = x + y * stride.
 *
 * @param[in] mapx_ptr Pointer to four float x-coordinates.
 * @param[in] mapy_ptr Pointer to four float y-coordinates.
 * @param[in] width    Image width (as float) broadcast across all four lanes.
 * @param[in] height   Image height (as float) broadcast across all four lanes.
 * @param[in] stride   Row stride in bytes broadcast across all four lanes.
 *
 * @return Four signed 32-bit byte offsets into the source buffer.
 */
inline int32x4_t offset_nearest_interpolation(const float       *mapx_ptr,
                                              const float       *mapy_ptr,
                                              const float32x4_t &width,
                                              const float32x4_t &height,
                                              const int32x4_t   &stride)
{
    // Lower clamp bound: one pixel above/left of the image (border area).
    const float32x4_t coord_floor = vdupq_n_f32(-1.f);

    float32x4_t map_x = vld1q_f32(mapx_ptr);
    float32x4_t map_y = vld1q_f32(mapy_ptr);

    // Clamp the sampling coordinates into [-1, width] / [-1, height].
    map_x = vmaxq_f32(coord_floor, vminq_f32(map_x, width));
    map_y = vmaxq_f32(coord_floor, vminq_f32(map_y, height));

    // Float -> int conversion truncates towards zero (no rounding applied).
    const int32x4_t ix = vcvtq_s32_f32(map_x);
    const int32x4_t iy = vcvtq_s32_f32(map_y);

    // Multiply-accumulate: ix + iy * stride, element-wise.
    return vmlaq_s32(ix, iy, stride);
}
// NOTE(review): the three lines below are disjoint fragments from different
// member functions of NERemapKernel -- the extraction dropped the surrounding
// lines. Do not read them as contiguous code.
// Default constructor initializer list: start with no dispatch function and
// no tensors bound.
70 : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
// configure(): select the nearest-neighbour implementation
// (presumably chosen when the interpolation policy is NEAREST_NEIGHBOR --
// the enclosing switch/if is not visible here).
95 _func = &NERemapKernel::remap_nearest;
// configure(): hand the computed execution window to the base kernel.
126 INEKernel::configure(win);
// NOTE(review): interior fragment of NERemapKernel::remap_nearest() -- the
// function signature, iterator construction and the execute_window_loop
// lambda wrapper were lost in extraction; annotated as-is, not repaired.
// Broadcast the input width/height (as float) and the row stride in bytes
// into all four NEON lanes so coordinates can be clamped vector-wise.
142 const float32x4_t width = vdupq_n_f32(static_cast<float>(_input->
info()->
dimension(0)));
143 const float32x4_t height = vdupq_n_f32(static_cast<float>(_input->
info()->
dimension(1)));
144 const int32x4_t in_stride = vdupq_n_s32(static_cast<int32_t>(_input->
info()->
strides_in_bytes()[1]));
// Per-iteration pointers: the F32 coordinate maps and the U8 input image
// (mapx/mapy/in are iterators advanced by the surrounding window loop).
148 const auto mapx_ptr =
reinterpret_cast<const float *
>(mapx.
ptr());
149 const auto mapy_ptr =
reinterpret_cast<const float *
>(mapy.
ptr());
150 const uint8_t *in_ptr = in.
ptr();
// Compute 16 byte-offsets (4 per call) for the 16 output pixels of this
// iteration: offset = clamp(x) + clamp(y) * stride.
152 const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr + 0, mapy_ptr + 0, width, height, in_stride);
153 const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, width, height, in_stride);
154 const int32x4_t offset2 = offset_nearest_interpolation(mapx_ptr + 8, mapy_ptr + 8, width, height, in_stride);
155 const int32x4_t offset3 = offset_nearest_interpolation(mapx_ptr + 12, mapy_ptr + 12, width, height, in_stride);
// Scalar gather: NEON has no gather load, so each of the 16 source pixels
// is fetched individually and inserted into its lane of the result vector.
157 uint8x16_t tmp = vdupq_n_u8(0);
158 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0);
159 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1);
160 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2);
161 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3);
162 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4);
163 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5);
164 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6);
165 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7);
166 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 0)], tmp, 8);
167 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 1)], tmp, 9);
168 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 2)], tmp, 10);
169 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 3)], tmp, 11);
170 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 0)], tmp, 12);
171 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 1)], tmp, 13);
172 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 2)], tmp, 14);
173 tmp = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 3)], tmp, 15);
// Store the 16 gathered pixels to the output row.
174 vst1q_u8(out.
ptr(), tmp);
// Tail of the execute_window_loop(...) call that wraps the lambda above,
// passing the iterators updated on each window step.
176 in, out, mapx, mapy);
// NOTE(review): interior fragment of NERemapKernel::remap_bilinear() -- the
// function signature, validation macros and the execute_window_loop lambda
// wrapper were lost in extraction; annotated as-is, not repaired.
// Bring pixel_bilinear_c1_clamp into scope.
181 using namespace scale_helpers;
// Scalar image geometry: width, height and row stride (bytes) of the input.
194 const size_t width = _input->info()->dimension(0);
195 const size_t height = _input->info()->dimension(1);
196 const size_t in_stride = _input->info()->strides_in_bytes()[1];
// Per-iteration pointers into the F32 coordinate maps and the U8 input.
200 const auto mapx_ptr =
reinterpret_cast<float *
>(mapx.
ptr());
201 const auto mapy_ptr =
reinterpret_cast<float *
>(mapy.
ptr());
202 const uint8_t *in_ptr = in.
ptr();
// First 8 output pixels: each is bilinearly interpolated (with clamping at
// the borders) from the source at the mapped (x, y) coordinate, then
// inserted into its lane of a 64-bit vector.
204 uint8x8_t tmp0 = vdup_n_u8(0);
205 tmp0 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
206 tmp0 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
207 tmp0 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
208 tmp0 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
209 tmp0 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
210 tmp0 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
211 tmp0 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
212 tmp0 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
// Second 8 output pixels, same scheme.
214 uint8x8_t tmp1 = vdup_n_u8(0);
215 tmp1 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
216 tmp1 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
217 tmp1 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
218 tmp1 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
219 tmp1 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
220 tmp1 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
221 tmp1 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
222 tmp1 = vset_lane_u8(
pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
// Combine both halves and store 16 interpolated pixels to the output row.
224 vst1q_u8(out.
ptr(), vcombine_u8(tmp0, tmp1));
// Tail of the execute_window_loop(...) call that wraps the lambda above.
226 in, out, mapx, mapy);
// NOTE(review): fragment of NERemapKernel::run() -- dispatch through the
// member-function pointer selected in configure() (remap_nearest or,
// presumably, remap_bilinear for the BILINEAR policy).
236 (this->*_func)(window);
unsigned int top
top of the border
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
InterpolationPolicy
Interpolation method.
const Window & window() const
The maximum window the kernel can be executed on.
NERemapKernel()
Default constructor.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
Container for 2D border size.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
1 channel, 1 U8 per channel
1 channel, 1 F32 per channel
Output values are defined by bilinear interpolation between the pixels.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Describe one of the image's dimensions with a start, end and step.
unsigned int bottom
bottom of the border
Output values are defined to match the source pixel whose center is nearest to the sample position.
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
Implementation of a static rectangular access pattern.
void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy)
Initialize the kernel's input, output and border mode.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Class to describe a number of elements in each dimension.
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
BorderSize border_size() const override
The size of the border for that kernel.
unsigned int left
left of the border
unsigned int right
right of the border
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
uint8_t pixel_bilinear_c1_clamp(const T *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y)
Return the pixel at (x,y) using bilinear interpolation by clamping when out of borders.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element of the window.
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
Container for valid region of a window.
Iterator updated by execute_window_loop for each window element.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
__kernel void remap_bilinear(__global uchar *in_ptr, uint in_stride_x, uint in_step_x, uint in_stride_y, uint in_step_y, uint in_offset_first_element_in_bytes, __global uchar *out_ptr, uint out_stride_x, uint out_step_x, uint out_stride_y, uint out_step_y, uint out_offset_first_element_in_bytes, __global uchar *mapx_ptr, uint mapx_stride_x, uint mapx_step_x, uint mapx_stride_y, uint mapx_step_y, uint mapx_offset_first_element_in_bytes, __global uchar *mapy_ptr, uint mapy_stride_x, uint mapy_step_x, uint mapy_stride_y, uint mapy_step_y, uint mapy_offset_first_element_in_bytes, const float width, const float height)
Performs a remapping of an input image to an output given two remapping images using bilinear as the interpolation policy.