43 : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
57 _run_sobel_x = output_x !=
nullptr;
58 _run_sobel_y = output_y !=
nullptr;
73 _border_size =
BorderSize(border_undefined ? 0 : 2, 2);
77 constexpr
unsigned int num_elems_read_per_iteration = 16;
78 constexpr
unsigned int num_elems_written_per_iteration = 8;
92 INEKernel::configure(win);
110 output_x =
Iterator(_output_x, window);
115 output_y =
Iterator(_output_y, window);
118 if(_run_sobel_y && _run_sobel_x)
120 static const int16x8_t six = vdupq_n_s16(6);
121 static const int16x8_t four = vdupq_n_s16(4);
122 static const int16x8_t two = vdupq_n_s16(2);
123 static const int16x8_t minustwo = vdupq_n_s16(-2);
127 const uint8x16_t data = vld1q_u8(input.
ptr());
129 const int16x8x2_t data_s16 =
132 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
133 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
137 int16x8_t out_y = data_s16.val[0];
138 out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
139 out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
140 out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
141 out_y = vaddq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
143 vst1q_s16(reinterpret_cast<int16_t *>(output_y.
ptr()), out_y);
145 int16x8_t out_x = vnegq_s16(data_s16.val[0]);
146 out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
147 out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
148 out_x = vaddq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
150 vst1q_s16(reinterpret_cast<int16_t *>(output_x.
ptr()), out_x);
152 input, output_x, output_y);
154 else if(_run_sobel_x)
156 static const int16x8_t two = vdupq_n_s16(2);
157 static const int16x8_t minustwo = vdupq_n_s16(-2);
161 const uint8x16_t data = vld1q_u8(input.
ptr());
163 const int16x8x2_t data_s16 =
166 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
167 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
171 int16x8_t out = vnegq_s16(data_s16.val[0]);
172 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
173 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
174 out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
176 vst1q_s16(reinterpret_cast<int16_t *>(output_x.
ptr()), out);
180 else if(_run_sobel_y)
182 static const int16x8_t six = vdupq_n_s16(6);
183 static const int16x8_t four = vdupq_n_s16(4);
187 const uint8x16_t data = vld1q_u8(input.
ptr());
189 const int16x8x2_t data_s16 =
192 vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
193 vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
197 int16x8_t out = data_s16.val[0];
198 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
199 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
200 out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
201 out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
203 vst1q_s16(reinterpret_cast<int16_t *>(output_y.
ptr()), out);
210 : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
223 _run_sobel_x = output_x !=
nullptr;
224 _run_sobel_y = output_y !=
nullptr;
240 _output_x = output_x;
241 _output_y = output_y;
243 const ITensor *
const input = _run_sobel_x ? input_x : input_y;
247 constexpr
unsigned int num_elems_read_per_iteration = 16;
248 constexpr
unsigned int num_elems_written_per_iteration = 16;
249 constexpr
unsigned int num_rows_read_per_iteration = 5;
264 INEKernel::configure(win);
278 const int16_t *input_x_low2_ptr =
nullptr;
279 const int16_t *input_x_low_ptr =
nullptr;
280 const int16_t *input_x_mid_ptr =
nullptr;
281 const int16_t *input_x_top_ptr =
nullptr;
282 const int16_t *input_x_top2_ptr =
nullptr;
284 const int16_t *input_y_low2_ptr =
nullptr;
285 const int16_t *input_y_low_ptr =
nullptr;
286 const int16_t *input_y_top_ptr =
nullptr;
287 const int16_t *input_y_top2_ptr =
nullptr;
291 input_x =
Iterator(_input_x, window);
292 output_x =
Iterator(_output_x, window);
302 input_y =
Iterator(_input_y, window);
303 output_y =
Iterator(_output_y, window);
310 static const int16x8_t six = vdupq_n_s16(6);
311 static const int16x8_t four = vdupq_n_s16(4);
312 static const int16x8_t two = vdupq_n_s16(2);
313 static const int16x8_t minustwo = vdupq_n_s16(-2);
320 const size_t input_offset_high_s16 = input_x.
offset() / 2;
321 const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
325 int16x8_t data_high = vld1q_s16(input_x_top2_ptr + input_offset_high_s16);
326 int16x8_t out_high = data_high;
328 data_high = vld1q_s16(input_x_top_ptr + input_offset_high_s16);
329 out_high = vmlaq_s16(out_high, data_high, four);
331 data_high = vld1q_s16(input_x_mid_ptr + input_offset_high_s16);
332 out_high = vmlaq_s16(out_high, data_high, six);
334 data_high = vld1q_s16(input_x_low_ptr + input_offset_high_s16);
335 out_high = vmlaq_s16(out_high, data_high, four);
337 data_high = vld1q_s16(input_x_low2_ptr + input_offset_high_s16);
338 out_high = vaddq_s16(out_high, data_high);
340 vst1q_s16((reinterpret_cast<int16_t *>(output_x.
ptr())), out_high);
344 int16x8_t data_low = vld1q_s16(input_x_top2_ptr + input_offset_low_s16);
345 int16x8_t out_low = data_low;
347 data_low = vld1q_s16(input_x_top_ptr + input_offset_low_s16);
348 out_low = vmlaq_s16(out_low, data_low, four);
350 data_low = vld1q_s16(input_x_mid_ptr + input_offset_low_s16);
351 out_low = vmlaq_s16(out_low, data_low, six);
353 data_low = vld1q_s16(input_x_low_ptr + input_offset_low_s16);
354 out_low = vmlaq_s16(out_low, data_low, four);
356 data_low = vld1q_s16(input_x_low2_ptr + input_offset_low_s16);
357 out_low = vaddq_s16(out_low, data_low);
359 vst1q_s16((reinterpret_cast<int16_t *>(output_x.
ptr())) + 8, out_low);
369 const size_t input_offset_high_s16 = input_y.
offset() / 2;
370 const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
374 int16x8_t data_high = vld1q_s16(input_y_top2_ptr + input_offset_high_s16);
375 int16x8_t out_high = vnegq_s16(data_high);
377 data_high = vld1q_s16(input_y_top_ptr + input_offset_high_s16);
378 out_high = vmlaq_s16(out_high, data_high, minustwo);
380 data_high = vld1q_s16(input_y_low_ptr + input_offset_high_s16);
381 out_high = vmlaq_s16(out_high, data_high, two);
383 data_high = vld1q_s16(input_y_low2_ptr + input_offset_high_s16);
384 out_high = vaddq_s16(out_high, data_high);
386 vst1q_s16((reinterpret_cast<int16_t *>(output_y.
ptr())), out_high);
390 int16x8_t data_low = vld1q_s16(input_y_top2_ptr + input_offset_low_s16);
391 int16x8_t out_low = vnegq_s16(data_low);
393 data_low = vld1q_s16(input_y_top_ptr + input_offset_low_s16);
394 out_low = vmlaq_s16(out_low, data_low, minustwo);
396 data_low = vld1q_s16(input_y_low_ptr + input_offset_low_s16);
397 out_low = vmlaq_s16(out_low, data_low, two);
399 data_low = vld1q_s16(input_y_low2_ptr + input_offset_low_s16);
400 out_low = vaddq_s16(out_low, data_low);
402 vst1q_s16((reinterpret_cast<int16_t *>(output_y.
ptr())) + 8, out_low);
unsigned int top
top of the border
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
void shift(size_t dimension, int shift_value)
Shift the values of a given dimension by the given shift_value.
Container for 2D border size.
1 channel, 1 U8 per channel
void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
Initialise the kernel's source, destination and border mode.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t,...)
Interface for Neon tensor.
Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
Implementation of a rectangular access pattern.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
BorderSize border_size() const override
The size of the border for that kernel.
Class to describe a number of elements in each dimension.
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
NESobel5x5HorKernel()
Default constructor.
unsigned int left
left of the border
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
1 channel, 1 S16 per channel
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
NESobel5x5VertKernel()
Default constructor.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
constexpr size_t offset() const
Return the offset in bytes from the first element to the current position of the iterator.
Iterator updated by execute_window_loop for each window element.
BorderSize border_size() const override
The size of the border for that kernel.
void configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
Initialise the kernel's source, destination and border mode.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)