59 _border_size =
BorderSize(border_undefined ? 0 : 2, 2);
63 constexpr
unsigned int num_elems_read_per_iteration = 16;
64 constexpr
unsigned int num_elems_written_per_iteration = 8;
75 INEKernel::configure(win);
90 static const int16x8_t six = vdupq_n_s16(6);
91 static const int16x8_t four = vdupq_n_s16(4);
95 uint8x16_t data = vld1q_u8(input.
ptr());
97 const int16x8x2_t data_s16 =
100 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
101 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
105 int16x8_t out = vaddq_s16(data_s16.val[0], vextq_s16(data_s16.val[0], data_s16.val[1], 4));
106 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
107 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
108 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
110 vst1q_s16(reinterpret_cast<int16_t *>(output.
ptr()), out);
134 constexpr
unsigned int num_elems_read_per_iteration = 32;
135 constexpr
unsigned int num_elems_written_per_iteration = 16;
136 constexpr
unsigned int num_rows_read_per_iteration = 5;
147 INEKernel::configure(win);
159 const uint8_t *input_top2_ptr = _input->ptr_to_element(
Coordinates(0, -2));
160 const uint8_t *input_top_ptr = _input->ptr_to_element(
Coordinates(0, -1));
161 const uint8_t *input_mid_ptr = _input->ptr_to_element(
Coordinates(0, 0));
162 const uint8_t *input_low_ptr = _input->ptr_to_element(
Coordinates(0, 1));
163 const uint8_t *input_low2_ptr = _input->ptr_to_element(
Coordinates(0, 2));
165 const uint16x8_t six = vdupq_n_u16(6);
166 const uint16x8_t four = vdupq_n_u16(4);
170 const size_t input_offset_high_s16 = input.
offset();
171 const size_t input_offset_low_s16 = input.
offset() + 16;
175 uint16x8_t data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + input_offset_high_s16)));
176 uint16x8_t out_high = data_high;
178 data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + input_offset_high_s16)));
179 out_high = vmlaq_u16(out_high, data_high, four);
181 data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + input_offset_high_s16)));
182 out_high = vmlaq_u16(out_high, data_high, six);
184 data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + input_offset_high_s16)));
185 out_high = vmlaq_u16(out_high, data_high, four);
187 data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + input_offset_high_s16)));
188 out_high = vaddq_u16(out_high, data_high);
192 uint16x8_t data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + input_offset_low_s16)));
193 uint16x8_t out_low = data_low;
195 data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + input_offset_low_s16)));
196 out_low = vmlaq_u16(out_low, data_low, four);
198 data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + input_offset_low_s16)));
199 out_low = vmlaq_u16(out_low, data_low, six);
201 data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + input_offset_low_s16)));
202 out_low = vmlaq_u16(out_low, data_low, four);
204 data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + input_offset_low_s16)));
205 out_low = vaddq_u16(out_low, data_low);
207 vst1q_u8(output.
ptr(), vcombine_u8(vqshrn_n_u16(out_high, 8),
208 vqshrn_n_u16(out_low, 8)));
unsigned int top
top of the border
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
BorderSize border_size() const override
The size of the border for that kernel.
const Window & window() const
The maximum window the kernel can be executed on.
NEGaussian5x5VertKernel()
Default constructor.
void shift(size_t dimension, int shift_value)
Shift the values of a given dimension by the given shift_value.
Container for 2D border size.
1 channel, 1 U8 per channel
Interface for Neon tensor.
Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
Implementation of a rectangular access pattern.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Class to describe a number of elements in each dimension.
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
unsigned int left
left of the border
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
1 channel, 1 S16 per channel
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
unsigned int num_elems_processed_per_iteration
void configure(const ITensor *input, ITensor *output, bool border_undefined)
Initialise the kernel's source, destination and border mode.
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each window element.
constexpr size_t offset() const
Return the offset in bytes from the first element to the current position of the iterator.
Iterator updated by execute_window_loop for each window element.
void configure(const ITensor *input, ITensor *output, bool border_undefined)
Initialise the kernel's source, destination and border mode.
NEGaussian5x5HorKernel()
Default constructor.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
BorderSize border_size() const override
The size of the border for that kernel.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.