Compute Library 21.02 - NESobel7x7Kernel.cpp
/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NESobel7x7Kernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "src/core/helpers/WindowHelpers.h"

#include <arm_neon.h>
#include <cstdint>

using namespace arm_compute;

namespace arm_compute
{
class Coordinates;
} // namespace arm_compute

namespace
{
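// Coefficient vectors for the separable 7x7 Sobel filter: the derivative
// component is {-1, -4, -5, 0, 5, 4, 1} and the smoothing component is
// {1, 6, 15, 20, 15, 6, 1}.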
const int32x4_t minusfour = vdupq_n_s32(-4);
const int32x4_t minusfive = vdupq_n_s32(-5);
const int32x4_t four      = vdupq_n_s32(4);
const int32x4_t five      = vdupq_n_s32(5);
const int32x4_t six       = vdupq_n_s32(6);
const int32x4_t fifteen   = vdupq_n_s32(15);
const int32x4_t twenty    = vdupq_n_s32(20);

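// Horizontal pass for the X gradient: convolves each pixel with the derivative
// coefficients {-1, -4, -5, 0, 5, 4, 1} along x. The 16 widened input pixels
// arrive as four int32x4_t lanes and produce 8 int32 results; the centre
// coefficient is 0, so that tap is skipped.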
inline int32x4x2_t compute_hor_sobel_x(const int32x4x4_t &data)
{
    int32x4x2_t out =
    {
        {
            vnegq_s32(data.val[0]),
            vnegq_s32(data.val[1])
        }
    };

    out.val[0] = vmlaq_s32(out.val[0],
                           vextq_s32(data.val[0], data.val[1], 1), minusfour);

    out.val[0] = vmlaq_s32(out.val[0],
                           vextq_s32(data.val[0], data.val[1], 2), minusfive);

    out.val[0] = vmlaq_s32(out.val[0], data.val[1], five);

    out.val[0] = vmlaq_s32(out.val[0],
                           vextq_s32(data.val[1], data.val[2], 1), four);

    out.val[0] = vaddq_s32(out.val[0],
                           vextq_s32(data.val[1], data.val[2], 2));

    out.val[1] = vmlaq_s32(out.val[1],
                           vextq_s32(data.val[1], data.val[2], 1), minusfour);

    out.val[1] = vmlaq_s32(out.val[1],
                           vextq_s32(data.val[1], data.val[2], 2), minusfive);

    out.val[1] = vmlaq_s32(out.val[1], data.val[2], five);

    out.val[1] = vmlaq_s32(out.val[1],
                           vextq_s32(data.val[2], data.val[3], 1), four);

    out.val[1] = vaddq_s32(out.val[1],
                           vextq_s32(data.val[2], data.val[3], 2));

    return out;
}

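// Horizontal pass for the Y gradient: same layout as above, but with the
// smoothing coefficients {1, 6, 15, 20, 15, 6, 1} along x.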
inline int32x4x2_t compute_hor_sobel_y(const int32x4x4_t &data)
{
    int32x4x2_t out =
    {
        {
            data.val[0],
            data.val[1]
        }
    };

    out.val[0] = vmlaq_s32(out.val[0],
                           vextq_s32(data.val[0], data.val[1], 1), six);

    out.val[0] = vmlaq_s32(out.val[0],
                           vextq_s32(data.val[0], data.val[1], 2), fifteen);

    out.val[0] = vmlaq_s32(out.val[0],
                           vextq_s32(data.val[0], data.val[1], 3), twenty);

    out.val[0] = vmlaq_s32(out.val[0], data.val[1], fifteen);

    out.val[0] = vmlaq_s32(out.val[0],
                           vextq_s32(data.val[1], data.val[2], 1), six);

    out.val[0] = vaddq_s32(out.val[0],
                           vextq_s32(data.val[1], data.val[2], 2));

    out.val[1] = vmlaq_s32(out.val[1],
                           vextq_s32(data.val[1], data.val[2], 1), six);

    out.val[1] = vmlaq_s32(out.val[1],
                           vextq_s32(data.val[1], data.val[2], 2), fifteen);

    out.val[1] = vmlaq_s32(out.val[1],
                           vextq_s32(data.val[1], data.val[2], 3), twenty);

    out.val[1] = vmlaq_s32(out.val[1], data.val[2], fifteen);

    out.val[1] = vmlaq_s32(out.val[1],
                           vextq_s32(data.val[2], data.val[3], 1), six);

    out.val[1] = vaddq_s32(out.val[1],
                           vextq_s32(data.val[2], data.val[3], 2));

    return out;
}
} // namespace

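// NESobel7x7HorKernel: first (horizontal) pass of the separable 7x7 Sobel.
// It consumes the U8 source image and produces S32 intermediate images for the
// X and/or Y gradients.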
NESobel7x7HorKernel::NESobel7x7HorKernel()
    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
{
}

BorderSize NESobel7x7HorKernel::border_size() const
{
    return _border_size;
}

void NESobel7x7HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8);
    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));

    _run_sobel_x = output_x != nullptr;
    _run_sobel_y = output_y != nullptr;

    if(_run_sobel_x)
    {
        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
    }

    if(_run_sobel_y)
    {
        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
    }

    _input       = input;
    _output_x    = output_x;
    _output_y    = output_y;
    _border_size = BorderSize(border_undefined ? 0 : 3, 3);

    // Configure kernel window
    constexpr unsigned int num_elems_processed_per_iteration = 8;
    constexpr unsigned int num_elems_read_per_iteration      = 16;
    constexpr unsigned int num_elems_written_per_iteration   = 8;

    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
                              output_x_access,
                              output_y_access);

    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}

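// Horizontal pass: for each output position, 16 source pixels starting 3 to the
// left of the centre are loaded, widened from U8 to S32 and convolved with the
// 1D coefficients; 8 S32 results are written per iteration for each requested
// gradient.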
void NESobel7x7HorKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    Iterator input(_input, window);
    Iterator output_x;
    Iterator output_y;

    if(_run_sobel_x)
    {
        output_x = Iterator(_output_x, window);
    }

    if(_run_sobel_y)
    {
        output_y = Iterator(_output_y, window);
    }

    if(_run_sobel_y && _run_sobel_x)
    {
        execute_window_loop(window, [&](const Coordinates &)
        {
            const uint8x16_t data = vld1q_u8(input.ptr() - 3);

            const uint16x8_t tmp_low_u16  = vmovl_u8(vget_low_u8(data));
            const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));

            const int32x4x4_t data_s32 =
            {
                {
                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
                }
            };

            const int32x4x2_t out_y = compute_hor_sobel_y(data_s32);
            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out_y.val[0]);
            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out_y.val[1]);

            const int32x4x2_t out_x = compute_hor_sobel_x(data_s32);
            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out_x.val[0]);
            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out_x.val[1]);
        },
        input, output_x, output_y);
    }
    else if(_run_sobel_x)
    {
        execute_window_loop(window, [&](const Coordinates &)
        {
            const uint8x16_t data = vld1q_u8(input.ptr() - 3);

            const uint16x8_t tmp_low_u16  = vmovl_u8(vget_low_u8(data));
            const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));

            const int32x4x4_t data_s32 =
            {
                {
                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
                }
            };

            const int32x4x2_t out = compute_hor_sobel_x(data_s32);
            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out.val[0]);
            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
        },
        input, output_x);
    }
    else if(_run_sobel_y)
    {
        execute_window_loop(window, [&](const Coordinates &)
        {
            const uint8x16_t data = vld1q_u8(input.ptr() - 3);

            const uint16x8_t tmp_low_u16  = vmovl_u8(vget_low_u8(data));
            const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));

            const int32x4x4_t data_s32 =
            {
                {
                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
                }
            };

            const int32x4x2_t out = compute_hor_sobel_y(data_s32);
            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out.val[0]);
            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
        },
        input, output_y);
    }
}

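// NESobel7x7VertKernel: second (vertical) pass of the separable 7x7 Sobel.
// It consumes the S32 intermediate images produced by the horizontal pass and
// accumulates along the y axis to produce the final S32 gradients.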
NESobel7x7VertKernel::NESobel7x7VertKernel()
    : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
{
}

BorderSize NESobel7x7VertKernel::border_size() const
{
    return BorderSize{ 3, 0 };
}

void NESobel7x7VertKernel::configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
{
    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));

    _run_sobel_x = (output_x != nullptr);
    _run_sobel_y = (output_y != nullptr);

    if(_run_sobel_x)
    {
        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S32);
        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
    }

    if(_run_sobel_y)
    {
        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S32);
        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
    }

    _input_x  = input_x;
    _input_y  = input_y;
    _output_x = output_x;
    _output_y = output_y;

    const ITensor *const input = _run_sobel_x ? input_x : input_y;

    // Configure kernel window
    constexpr unsigned int num_elems_processed_per_iteration = 8;
    constexpr unsigned int num_elems_read_per_iteration      = 8;
    constexpr unsigned int num_elems_written_per_iteration   = 8;
    constexpr unsigned int num_rows_read_per_iteration       = 7;

    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);

    update_window_and_padding(win,
                              AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
                              AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
                              output_x_access,
                              output_y_access);

    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());

    INEKernel::configure(win);
}

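// Vertical pass: for each output position, seven S32 rows centred on the
// current row are accumulated column-wise; the X gradient uses the smoothing
// coefficients {1, 6, 15, 20, 15, 6, 1} and the Y gradient uses the derivative
// coefficients {-1, -4, -5, 0, 5, 4, 1}.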
void NESobel7x7VertKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    Iterator input_x;
    Iterator input_y;
    Iterator output_x;
    Iterator output_y;

    int32_t in_x_stride = 0;
    int32_t in_y_stride = 0;

    if(_run_sobel_x)
    {
        input_x     = Iterator(_input_x, window);
        output_x    = Iterator(_output_x, window);
        in_x_stride = _input_x->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_x->info()->format());
    }

    if(_run_sobel_y)
    {
        input_y     = Iterator(_input_y, window);
        output_y    = Iterator(_output_y, window);
        in_y_stride = _input_y->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_y->info()->format());
    }

    if(_run_sobel_x)
    {
        execute_window_loop(window, [&](const Coordinates &)
        {
            auto in_ptr = reinterpret_cast<int32_t *>(input_x.ptr()) - 3 * in_x_stride;

            //top3
            int32x4x2_t data =
            {
                {
                    vld1q_s32(in_ptr),
                    vld1q_s32(in_ptr + 4)
                }
            };

            int32x4x2_t out = data;

            //top2
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], six);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], six);

            //top
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], fifteen);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], fifteen);

            //mid
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], twenty);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], twenty);

            //low
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], fifteen);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], fifteen);

            //low2
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], six);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], six);

            //low3
            in_ptr += in_x_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vaddq_s32(out.val[0], data.val[0]);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vaddq_s32(out.val[1], data.val[1]);

            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 0, out.val[0]);
            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
        },
        input_x, output_x);
    }

    if(_run_sobel_y)
    {
        execute_window_loop(window, [&](const Coordinates &)
        {
            auto in_ptr = reinterpret_cast<int32_t *>(input_y.ptr()) - 3 * in_y_stride;

            //top3
            int32x4x2_t data =
            {
                {
                    vld1q_s32(in_ptr),
                    vld1q_s32(in_ptr + 4)
                }
            };

            int32x4x2_t out =
            {
                {
                    vnegq_s32(data.val[0]),
                    vnegq_s32(data.val[1])
                }
            };

            //top2
            in_ptr += in_y_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], minusfour);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], minusfour);

            //top
            in_ptr += in_y_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], minusfive);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], minusfive);

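            // The centre row has coefficient 0 in the derivative kernel, so it
            // is skipped below by advancing the pointer two rows at once.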
            //low
            in_ptr += (2 * in_y_stride);
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], five);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], five);

            //low2
            in_ptr += in_y_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], four);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], four);

            //low3
            in_ptr += in_y_stride;
            data.val[0] = vld1q_s32(in_ptr);
            out.val[0]  = vaddq_s32(out.val[0], data.val[0]);

            data.val[1] = vld1q_s32(in_ptr + 4);
            out.val[1]  = vaddq_s32(out.val[1], data.val[1]);

            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 0, out.val[0]);
            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
        },
        input_y, output_y);
    }
}
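
A minimal usage sketch (not part of this file): in release 21.02 these two kernels are normally driven through the NESobel7x7 runtime function, which allocates the S32 intermediate images and schedules the horizontal and vertical passes. The tensor shape and border mode below are illustrative assumptions, not values taken from this file.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
#include "arm_compute/runtime/Tensor.h"

int main()
{
    using namespace arm_compute;

    // Illustrative 640x480 single-channel U8 source and S32 gradient outputs.
    Tensor src, gx, gy;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));
    gx.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::S32));
    gy.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::S32));

    // NESobel7x7 runs NESobel7x7HorKernel followed by NESobel7x7VertKernel,
    // with an S32 intermediate image per requested gradient.
    NESobel7x7 sobel;
    sobel.configure(&src, &gx, &gy, BorderMode::UNDEFINED);

    src.allocator()->allocate();
    gx.allocator()->allocate();
    gy.allocator()->allocate();

    // ... fill src with image data ...

    sobel.run();
    return 0;
}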