71 constexpr
unsigned int num_elems_read_per_iteration = 32;
72 constexpr
unsigned int num_elems_written_per_iteration = 8;
95 if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0)
107 INEKernel::configure(win);
117 static const int16x8_t six = vdupq_n_s16(6);
118 static const int16x8_t four = vdupq_n_s16(4);
133 const uint8x16x2_t data_2q = vld2q_u8(in.
ptr());
134 const uint8x16_t &data_even = data_2q.val[0];
135 const uint8x16_t &data_odd = data_2q.val[1];
137 const int16x8_t data_l2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_even)));
138 const int16x8_t data_l1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_odd)));
139 const int16x8_t data_m = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 1))));
140 const int16x8_t data_r1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_odd, data_odd, 1))));
141 const int16x8_t data_r2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 2))));
143 int16x8_t out_val = vaddq_s16(data_l2, data_r2);
144 out_val = vmlaq_s16(out_val, data_l1, four);
145 out_val = vmlaq_s16(out_val, data_m, six);
146 out_val = vmlaq_s16(out_val, data_r1, four);
148 vst1q_s16(reinterpret_cast<int16_t *>(out.
ptr()), out_val);
179 constexpr
unsigned int num_rows_processed_per_iteration = 2;
181 constexpr
unsigned int num_elems_written_per_iteration = 16;
182 constexpr
unsigned int num_rows_written_per_iteration = 1;
184 constexpr
unsigned int num_elems_read_per_iteration = 16;
185 constexpr
unsigned int num_rows_read_per_iteration = 5;
196 if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0)
198 _t2_load_offset += 1;
207 INEKernel::configure(win);
219 static const uint16x8_t six = vdupq_n_u16(6);
220 static const uint16x8_t four = vdupq_n_u16(4);
235 const uint8_t *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(
Coordinates(0, 0));
236 const uint8_t *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(
Coordinates(0, 1));
237 const uint8_t *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(
Coordinates(0, 2));
238 const uint8_t *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(
Coordinates(0, 3));
239 const uint8_t *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(
Coordinates(0, 4));
244 const uint16x8_t data_low_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.
offset())));
245 const uint16x8_t data_low_t1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + in.
offset())));
246 const uint16x8_t data_low_m = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + in.
offset())));
247 const uint16x8_t data_low_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + in.
offset())));
248 const uint16x8_t data_low_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + in.
offset())));
250 uint16x8_t out_low = vaddq_u16(data_low_t2, data_low_b2);
251 out_low = vmlaq_u16(out_low, data_low_t1, four);
252 out_low = vmlaq_u16(out_low, data_low_m, six);
253 out_low = vmlaq_u16(out_low, data_low_b1, four);
258 const uint16x8_t data_high_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.
offset())));
259 const uint16x8_t data_high_t1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + in.
offset())));
260 const uint16x8_t data_high_m = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + in.
offset())));
261 const uint16x8_t data_high_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + in.
offset())));
262 const uint16x8_t data_high_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + in.
offset())));
264 uint16x8_t out_high = vaddq_u16(data_high_t2, data_high_b2);
265 out_high = vmlaq_u16(out_high, data_high_t1, four);
266 out_high = vmlaq_u16(out_high, data_high_m, six);
267 out_high = vmlaq_u16(out_high, data_high_b1, four);
269 vst1q_u8(out.
ptr(), vcombine_u8(vqshrn_n_u16(out_low, 8), vqshrn_n_u16(out_high, 8)));
unsigned int top
top of the border
void scale(size_t dimension, float scale_value)
Scale the values of a given dimension by the given scale_value.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
void configure(const ITensor *input, ITensor *output)
Initialise the kernel's source, destination and border mode.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
void shift(size_t dimension, int shift_value)
Shift the values of a given dimension by the given shift_value.
Container for 2D border size.
void increment(size_t dimension)
Increment the iterator along the specified dimension of the step value associated to the dimension...
constexpr int step() const
Return the step of the dimension.
void configure(const ITensor *input, ITensor *output)
Initialise the kernel's source, destination and border mode.
1 channel, 1 U8 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Interface for Neon tensor.
Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Copyright (c) 2017-2021 Arm Limited.
virtual void set_valid_region(const ValidRegion &valid_region)=0
Set the valid region of the tensor.
Implementation of a rectangular access pattern.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
BorderSize border_size() const override
The size of the border for that kernel.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
Class to describe a number of elements in each dimension.
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
unsigned int left
left of the border
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
1 channel, 1 S16 per channel
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
NEGaussianPyramidVertKernel()
Default constructor.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
void set_dimension_step(size_t dimension, int step)
Set the step of a given dimension.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
BorderSize border_size() const override
The size of the border for that kernel.
constexpr const Dimension & y() const
Alias to access the second dimension of the window.
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
constexpr size_t offset() const
Return the offset in bytes from the first element to the current position of the iterator.
Container for valid region of a window.
static constexpr size_t num_max_dimensions
Number of dimensions the tensor has.
Iterator updated by execute_window_loop for each window element.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
NEGaussianPyramidHorKernel()
Default constructor.