// Sobel 7x7 filter coefficients, pre-broadcast into NEON vectors so the
// per-pixel loops can use them directly with vmlaq_s32.
//   Derivative taps (X pass): { -1, -4, -5, 0, 5, 4, 1 }
//   Smoothing taps  (Y pass): {  1,  6, 15, 20, 15, 6, 1 }
// (The +/-1 taps are applied with vnegq_s32/vaddq_s32 and need no constant.)
const int32x4_t minusfour = vdupq_n_s32(-4);
const int32x4_t minusfive = vdupq_n_s32(-5);
const int32x4_t four      = vdupq_n_s32(4);
const int32x4_t five      = vdupq_n_s32(5);
const int32x4_t six       = vdupq_n_s32(6);
const int32x4_t fifteen   = vdupq_n_s32(15);
const int32x4_t twenty    = vdupq_n_s32(20);
/** Horizontal pass of the Sobel-X 7x7 filter on 8 consecutive output pixels.
 *
 * Applies the 1-D derivative taps { -1, -4, -5, 0, 5, 4, 1 } across the
 * 16 widened input pixels held in @p data.
 *
 * @param[in] data Sixteen consecutive input pixels widened to int32,
 *                 packed as four int32x4_t lanes (val[0]..val[3]).
 *
 * @return Two int32x4_t vectors holding the 8 filtered results.
 */
inline int32x4x2_t compute_hor_sobel_x(const int32x4x4_t &data)
{
    // Tap -1 at offset 0: seed the accumulators with the negated
    // leftmost pixels instead of a separate multiply.
    int32x4x2_t out =
    {
        {
            vnegq_s32(data.val[0]),
            vnegq_s32(data.val[1])
        }
    };

    // First 4 results: taps at offsets 1..6 (offset 3 has weight 0 and is skipped).
    out.val[0] = vmlaq_s32(out.val[0], vextq_s32(data.val[0], data.val[1], 1), minusfour);
    out.val[0] = vmlaq_s32(out.val[0], vextq_s32(data.val[0], data.val[1], 2), minusfive);
    out.val[0] = vmlaq_s32(out.val[0], data.val[1], five);
    out.val[0] = vmlaq_s32(out.val[0], vextq_s32(data.val[1], data.val[2], 1), four);
    out.val[0] = vaddq_s32(out.val[0], vextq_s32(data.val[1], data.val[2], 2));

    // Next 4 results: same taps, shifted by one int32x4_t lane.
    out.val[1] = vmlaq_s32(out.val[1], vextq_s32(data.val[1], data.val[2], 1), minusfour);
    out.val[1] = vmlaq_s32(out.val[1], vextq_s32(data.val[1], data.val[2], 2), minusfive);
    out.val[1] = vmlaq_s32(out.val[1], data.val[2], five);
    out.val[1] = vmlaq_s32(out.val[1], vextq_s32(data.val[2], data.val[3], 1), four);
    out.val[1] = vaddq_s32(out.val[1], vextq_s32(data.val[2], data.val[3], 2));

    return out;
}
/** Horizontal pass of the Sobel-Y 7x7 filter on 8 consecutive output pixels.
 *
 * Applies the 1-D smoothing taps { 1, 6, 15, 20, 15, 6, 1 } across the
 * 16 widened input pixels held in @p data.
 *
 * @param[in] data Sixteen consecutive input pixels widened to int32,
 *                 packed as four int32x4_t lanes (val[0]..val[3]).
 *
 * @return Two int32x4_t vectors holding the 8 filtered results.
 */
inline int32x4x2_t compute_hor_sobel_y(const int32x4x4_t &data)
{
    // Tap 1 at offset 0: seed the accumulators with the leftmost pixels.
    int32x4x2_t out =
    {
        {
            data.val[0],
            data.val[1]
        }
    };

    // First 4 results: taps 6, 15, 20, 15, 6, 1 at offsets 1..6.
    out.val[0] = vmlaq_s32(out.val[0], vextq_s32(data.val[0], data.val[1], 1), six);
    out.val[0] = vmlaq_s32(out.val[0], vextq_s32(data.val[0], data.val[1], 2), fifteen);
    out.val[0] = vmlaq_s32(out.val[0], vextq_s32(data.val[0], data.val[1], 3), twenty);
    out.val[0] = vmlaq_s32(out.val[0], data.val[1], fifteen);
    out.val[0] = vmlaq_s32(out.val[0], vextq_s32(data.val[1], data.val[2], 1), six);
    out.val[0] = vaddq_s32(out.val[0], vextq_s32(data.val[1], data.val[2], 2));

    // Next 4 results: same taps, shifted by one int32x4_t lane.
    out.val[1] = vmlaq_s32(out.val[1], vextq_s32(data.val[1], data.val[2], 1), six);
    out.val[1] = vmlaq_s32(out.val[1], vextq_s32(data.val[1], data.val[2], 2), fifteen);
    out.val[1] = vmlaq_s32(out.val[1], vextq_s32(data.val[1], data.val[2], 3), twenty);
    out.val[1] = vmlaq_s32(out.val[1], data.val[2], fifteen);
    out.val[1] = vmlaq_s32(out.val[1], vextq_s32(data.val[2], data.val[3], 1), six);
    out.val[1] = vaddq_s32(out.val[1], vextq_s32(data.val[2], data.val[3], 2));

    return out;
}
146 : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
160 _run_sobel_x = output_x !=
nullptr;
161 _run_sobel_y = output_y !=
nullptr;
174 _output_x = output_x;
175 _output_y = output_y;
176 _border_size =
BorderSize(border_undefined ? 0 : 3, 3);
180 constexpr
unsigned int num_elems_read_per_iteration = 16;
181 constexpr
unsigned int num_elems_written_per_iteration = 8;
195 INEKernel::configure(win);
210 output_x =
Iterator(_output_x, window);
215 output_y =
Iterator(_output_y, window);
218 if(_run_sobel_y && _run_sobel_x)
222 const uint8x16_t data = vld1q_u8(input.
ptr() - 3);
224 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
225 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
227 const int32x4x4_t data_s32 =
230 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
231 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
232 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
233 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
237 const int32x4x2_t out_y = compute_hor_sobel_y(data_s32);
238 vst1q_s32(reinterpret_cast<int32_t *>(output_y.
ptr()), out_y.val[0]);
239 vst1q_s32(reinterpret_cast<int32_t *>(output_y.
ptr()) + 4, out_y.val[1]);
241 const int32x4x2_t out_x = compute_hor_sobel_x(data_s32);
242 vst1q_s32(reinterpret_cast<int32_t *>(output_x.
ptr()), out_x.val[0]);
243 vst1q_s32(reinterpret_cast<int32_t *>(output_x.
ptr()) + 4, out_x.val[1]);
245 input, output_x, output_y);
247 else if(_run_sobel_x)
251 const uint8x16_t data = vld1q_u8(input.
ptr() - 3);
253 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
254 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
256 const int32x4x4_t data_s32 =
259 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
260 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
261 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
262 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
266 const int32x4x2_t out = compute_hor_sobel_x(data_s32);
267 vst1q_s32(reinterpret_cast<int32_t *>(output_x.
ptr()), out.val[0]);
268 vst1q_s32(reinterpret_cast<int32_t *>(output_x.
ptr()) + 4, out.val[1]);
272 else if(_run_sobel_y)
276 const uint8x16_t data = vld1q_u8(input.
ptr() - 3);
278 const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data));
279 const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
281 const int32x4x4_t data_s32 =
284 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
285 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
286 vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
287 vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
291 const int32x4x2_t out = compute_hor_sobel_y(data_s32);
292 vst1q_s32(reinterpret_cast<int32_t *>(output_y.
ptr()), out.val[0]);
293 vst1q_s32(reinterpret_cast<int32_t *>(output_y.
ptr()) + 4, out.val[1]);
300 : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
313 _run_sobel_x = (output_x !=
nullptr);
314 _run_sobel_y = (output_y !=
nullptr);
330 _output_x = output_x;
331 _output_y = output_y;
333 const ITensor *
const input = _run_sobel_x ? input_x : input_y;
337 constexpr
unsigned int num_elems_read_per_iteration = 8;
338 constexpr
unsigned int num_elems_written_per_iteration = 8;
339 constexpr
unsigned int num_rows_read_per_iteration = 7;
354 INEKernel::configure(win);
368 int32_t in_x_stride = 0;
369 int32_t in_y_stride = 0;
373 input_x =
Iterator(_input_x, window);
374 output_x =
Iterator(_output_x, window);
380 input_y =
Iterator(_input_y, window);
381 output_y =
Iterator(_output_y, window);
389 auto in_ptr =
reinterpret_cast<int32_t *
>(input_x.
ptr()) - 3 * in_x_stride;
396 vld1q_s32(in_ptr + 4)
400 int32x4x2_t out = data;
403 in_ptr += in_x_stride;
404 data.val[0] = vld1q_s32(in_ptr);
405 out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
407 data.val[1] = vld1q_s32(in_ptr + 4);
408 out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
411 in_ptr += in_x_stride;
412 data.val[0] = vld1q_s32(in_ptr);
413 out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
415 data.val[1] = vld1q_s32(in_ptr + 4);
416 out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
419 in_ptr += in_x_stride;
420 data.val[0] = vld1q_s32(in_ptr);
421 out.val[0] = vmlaq_s32(out.val[0], data.val[0], twenty);
423 data.val[1] = vld1q_s32(in_ptr + 4);
424 out.val[1] = vmlaq_s32(out.val[1], data.val[1], twenty);
427 in_ptr += in_x_stride;
428 data.val[0] = vld1q_s32(in_ptr);
429 out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen);
431 data.val[1] = vld1q_s32(in_ptr + 4);
432 out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen);
435 in_ptr += in_x_stride;
436 data.val[0] = vld1q_s32(in_ptr);
437 out.val[0] = vmlaq_s32(out.val[0], data.val[0], six);
439 data.val[1] = vld1q_s32(in_ptr + 4);
440 out.val[1] = vmlaq_s32(out.val[1], data.val[1], six);
443 in_ptr += in_x_stride;
444 data.val[0] = vld1q_s32(in_ptr);
445 out.val[0] = vaddq_s32(out.val[0], data.val[0]);
447 data.val[1] = vld1q_s32(in_ptr + 4);
448 out.val[1] = vaddq_s32(out.val[1], data.val[1]);
450 vst1q_s32(reinterpret_cast<int32_t *>(output_x.
ptr()) + 0, out.val[0]);
451 vst1q_s32(reinterpret_cast<int32_t *>(output_x.
ptr()) + 4, out.val[1]);
460 auto in_ptr =
reinterpret_cast<int32_t *
>(input_y.
ptr()) - 3 * in_y_stride;
467 vld1q_s32(in_ptr + 4)
474 vnegq_s32(data.val[0]),
475 vnegq_s32(data.val[1])
480 in_ptr += in_y_stride;
481 data.val[0] = vld1q_s32(in_ptr);
482 out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfour);
484 data.val[1] = vld1q_s32(in_ptr + 4);
485 out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfour);
488 in_ptr += in_y_stride;
489 data.val[0] = vld1q_s32(in_ptr);
490 out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfive);
492 data.val[1] = vld1q_s32(in_ptr + 4);
493 out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfive);
496 in_ptr += (2 * in_y_stride);
497 data.val[0] = vld1q_s32(in_ptr);
498 out.val[0] = vmlaq_s32(out.val[0], data.val[0], five);
500 data.val[1] = vld1q_s32(in_ptr + 4);
501 out.val[1] = vmlaq_s32(out.val[1], data.val[1], five);
504 in_ptr += in_y_stride;
505 data.val[0] = vld1q_s32(in_ptr);
506 out.val[0] = vmlaq_s32(out.val[0], data.val[0], four);
508 data.val[1] = vld1q_s32(in_ptr + 4);
509 out.val[1] = vmlaq_s32(out.val[1], data.val[1], four);
512 in_ptr += in_y_stride;
513 data.val[0] = vld1q_s32(in_ptr);
514 out.val[0] = vaddq_s32(out.val[0], data.val[0]);
516 data.val[1] = vld1q_s32(in_ptr + 4);
517 out.val[1] = vaddq_s32(out.val[1], data.val[1]);
519 vst1q_s32(reinterpret_cast<int32_t *>(output_y.
ptr()) + 0, out.val[0]);
520 vst1q_s32(reinterpret_cast<int32_t *>(output_y.
ptr()) + 4, out.val[1]);
unsigned int top
top of the border
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
BorderSize border_size() const override
The size of the border for that kernel.
const Window & window() const
The maximum window the kernel can be executed on.
Container for 2D border size.
1 channel, 1 U8 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t,...)
Interface for Neon tensor.
Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
1 channel, 1 S32 per channel
Implementation of a rectangular access pattern.
virtual Format format() const =0
Colour format of the image.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Class to describe a number of elements in each dimension.
Implementation of a row access pattern.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
void configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
Initialise the kernel's source, destination and border mode.
NESobel7x7HorKernel()
Default constructor.
NESobel7x7VertKernel()
Default constructor.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
unsigned int left
left of the border
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
size_t pixel_size_from_format(Format format)
The size in bytes of the pixel format.
void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
Initialise the kernel's source, destination and border mode.
unsigned int num_elems_processed_per_iteration
BorderSize border_size() const override
The size of the border for that kernel.
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
Iterator updated by execute_window_loop for each window element.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)