Compute Library
 21.02
NESobel5x5Kernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
27 #include "arm_compute/core/Error.h"
30 #include "arm_compute/core/Types.h"
35 
36 #include <arm_neon.h>
37 #include <cstddef>
38 #include <cstdint>
39 
40 using namespace arm_compute;
41 
43  : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
44 {
    // Intentionally empty: every member receives its default in the initialiser list above.
    // NOTE(review): the constructor's signature line is not present in this listing.
45 }
46 
48 {
    // Hands back the stored _border_size member.
    // NOTE(review): the signature line is not present in this listing.
49  return _border_size;
50 }
51 
/**
 * Set up the horizontal kernel: record the tensors, pick the border size and
 * compute the execution window.
 *
 * NOTE(review): this listing drops some original lines (the embedded numbering
 * skips), so a few statements of this function are not visible here.
 *
 * @param input            Source tensor; its info() is used to compute the window.
 * @param output_x         Destination for the X result, or nullptr to skip that output.
 * @param output_y         Destination for the Y result, or nullptr to skip that output.
 * @param border_undefined Selects the border size and is forwarded to the window
 *                         and valid-region calculations.
 */
52 void NESobel5x5HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
53 {
    // At least one of the two outputs must be supplied.
55  ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
56 
    // Each output is computed only when its tensor was provided.
57  _run_sobel_x = output_x != nullptr;
58  _run_sobel_y = output_y != nullptr;
59 
60  if(_run_sobel_x)
61  {
    // NOTE(review): the statement(s) of this branch are not present in this listing.
63  }
64 
65  if(_run_sobel_y)
66  {
    // NOTE(review): the statement(s) of this branch are not present in this listing.
68  }
69 
70  _input = input;
71  _output_x = output_x;
72  _output_y = output_y;
    // First argument is 0 for an undefined border and 2 otherwise; the second is always 2.
    // Presumably (top/bottom, left/right) -- TODO confirm against the BorderSize constructor.
73  _border_size = BorderSize(border_undefined ? 0 : 2, 2);
74 
75  // Configure kernel window
    // Eight outputs are produced per step, while sixteen input elements are read for them.
76  constexpr unsigned int num_elems_processed_per_iteration = 8;
77  constexpr unsigned int num_elems_read_per_iteration = 16;
78  constexpr unsigned int num_elems_written_per_iteration = 8;
79 
80  Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
    // A null output yields an access pattern built on a null tensor info.
81  AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
82  AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
83 
    // NOTE(review): the call that receives the three arguments below is missing from this
    // listing; presumably update_window_and_padding(win, ...) -- confirm against the original.
    // The input access starts border_size().left elements before the current position.
85  AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
86  output_x_access,
87  output_y_access);
88 
89  output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
90  output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
91 
92  INEKernel::configure(win);
93 }
94 
96 {
    // Horizontal 5-tap pass: for every window position, eight 16-bit results are written
    // to each enabled output from sixteen 8-bit inputs.
    // NOTE(review): the signature line is missing from this listing; the body uses `window`
    // and `info`, presumably run(const Window &window, const ThreadInfo &info).
97  ARM_COMPUTE_UNUSED(info);
100 
    // The input iterator walks a copy of the window shifted two elements to the left in X,
    // so the five taps for an output start two elements before that output's position.
101  Window win_in(window);
102  win_in.shift(Window::DimX, -2);
103 
104  Iterator input(_input, win_in);
105  Iterator output_x;
106  Iterator output_y;
107 
    // Output iterators are only bound for the outputs that were configured.
108  if(_run_sobel_x)
109  {
110  output_x = Iterator(_output_x, window);
111  }
112 
113  if(_run_sobel_y)
114  {
115  output_y = Iterator(_output_y, window);
116  }
117 
    // Case 1: both outputs requested -- one load feeds both results.
118  if(_run_sobel_y && _run_sobel_x)
119  {
    // Coefficients broadcast to all eight lanes.
120  static const int16x8_t six = vdupq_n_s16(6);
121  static const int16x8_t four = vdupq_n_s16(4);
122  static const int16x8_t two = vdupq_n_s16(2);
123  static const int16x8_t minustwo = vdupq_n_s16(-2);
124 
125  execute_window_loop(window, [&](const Coordinates &)
126  {
127  const uint8x16_t data = vld1q_u8(input.ptr());
128 
    // Widen the sixteen loaded bytes into two vectors of eight 16-bit lanes.
129  const int16x8x2_t data_s16 =
130  {
131  {
132  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
133  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
134  }
135  };
136 
    // vextq_s16(lo, hi, k) gives the eight lanes starting at element k of lo:hi, so lane i
    // accumulates in[i] + 4*in[i+1] + 6*in[i+2] + 4*in[i+3] + in[i+4].
137  int16x8_t out_y = data_s16.val[0];
138  out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
139  out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
140  out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
141  out_y = vaddq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
142 
143  vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out_y);
144 
    // Lane i accumulates -in[i] - 2*in[i+1] + 2*in[i+3] + in[i+4]; the centre tap is zero.
145  int16x8_t out_x = vnegq_s16(data_s16.val[0]);
146  out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
147  out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
148  out_x = vaddq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
149 
150  vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out_x);
151  },
152  input, output_x, output_y);
153  }
    // Case 2: X output only -- taps (-1, -2, 0, 2, 1).
154  else if(_run_sobel_x)
155  {
156  static const int16x8_t two = vdupq_n_s16(2);
157  static const int16x8_t minustwo = vdupq_n_s16(-2);
158 
159  execute_window_loop(window, [&](const Coordinates &)
160  {
161  const uint8x16_t data = vld1q_u8(input.ptr());
162 
    // Widen the sixteen loaded bytes into two vectors of eight 16-bit lanes.
163  const int16x8x2_t data_s16 =
164  {
165  {
166  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
167  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
168  }
169  };
170 
171  int16x8_t out = vnegq_s16(data_s16.val[0]);
172  out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
173  out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
174  out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
175 
176  vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out);
177  },
178  input, output_x);
179  }
    // Case 3: Y output only -- taps (1, 4, 6, 4, 1).
180  else if(_run_sobel_y)
181  {
182  static const int16x8_t six = vdupq_n_s16(6);
183  static const int16x8_t four = vdupq_n_s16(4);
184 
185  execute_window_loop(window, [&](const Coordinates &)
186  {
187  const uint8x16_t data = vld1q_u8(input.ptr());
188 
    // Widen the sixteen loaded bytes into two vectors of eight 16-bit lanes.
189  const int16x8x2_t data_s16 =
190  {
191  {
192  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
193  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
194  }
195  };
196 
197  int16x8_t out = data_s16.val[0];
198  out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
199  out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
200  out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
201  out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
202 
203  vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out);
204  },
205  input, output_y);
206  }
207 }
208 
210  : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
211 {
    // Intentionally empty: every member receives its default in the initialiser list above.
    // NOTE(review): the constructor's signature line is not present in this listing.
212 }
213 
215 {
    // Constant border, independent of any configuration state.
    // NOTE(review): the signature line is not present in this listing.
216  return BorderSize{ 2, 0 };
217 }
218 
/**
 * Set up the vertical kernel: record the tensors and compute the execution window.
 *
 * NOTE(review): this listing drops some original lines (the embedded numbering
 * skips), so a few statements of this function are not visible here.
 *
 * @param input_x          Source read when the X output is enabled; may be nullptr otherwise.
 * @param input_y          Source read when the Y output is enabled; may be nullptr otherwise.
 * @param output_x         Destination for the X result, or nullptr to skip that output.
 * @param output_y         Destination for the Y result, or nullptr to skip that output.
 * @param border_undefined Forwarded to the window and valid-region calculations.
 */
219 void NESobel5x5VertKernel::configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
220 {
    // At least one of the two outputs must be supplied.
221  ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
222 
    // Each output is computed only when its tensor was provided.
223  _run_sobel_x = output_x != nullptr;
224  _run_sobel_y = output_y != nullptr;
225 
226  if(_run_sobel_x)
227  {
    // NOTE(review): the statement(s) of this branch are not present in this listing.
230  }
231 
232  if(_run_sobel_y)
233  {
    // NOTE(review): the statement(s) of this branch are not present in this listing.
236  }
237 
238  _input_x = input_x;
239  _input_y = input_y;
240  _output_x = output_x;
241  _output_y = output_y;
242 
    // Reference tensor for the window and valid region: input_x when X runs, otherwise input_y.
243  const ITensor *const input = _run_sobel_x ? input_x : input_y;
244 
245  // Configure kernel window
    // Sixteen elements per step, read from five consecutive rows.
246  constexpr unsigned int num_elems_processed_per_iteration = 16;
247  constexpr unsigned int num_elems_read_per_iteration = 16;
248  constexpr unsigned int num_elems_written_per_iteration = 16;
249  constexpr unsigned int num_rows_read_per_iteration = 5;
250 
251  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
    // A null output yields an access pattern built on a null tensor info.
252  AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
253  AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
254 
    // NOTE(review): the call that receives the four arguments below is missing from this
    // listing; presumably update_window_and_padding(win, ...) -- confirm against the original.
    // Each input rectangle starts border_size().top rows above the current row.
256  AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
257  AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
258  output_x_access,
259  output_y_access);
260 
261  output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
262  output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
263 
264  INEKernel::configure(win);
265 }
266 
268 {
    // Vertical 5-tap pass: for every window position, sixteen 16-bit results are written
    // to each enabled output, combining values from rows -2..+2 of the matching input.
    // NOTE(review): the signature line is missing from this listing; the body uses `window`
    // and `info`, presumably run(const Window &window, const ThreadInfo &info).
269  ARM_COMPUTE_UNUSED(info);
272 
273  Iterator input_x;
274  Iterator input_y;
275  Iterator output_x;
276  Iterator output_y;
277 
    // Row pointers stay null for an output that is not being computed.
278  const int16_t *input_x_low2_ptr = nullptr;
279  const int16_t *input_x_low_ptr = nullptr;
280  const int16_t *input_x_mid_ptr = nullptr;
281  const int16_t *input_x_top_ptr = nullptr;
282  const int16_t *input_x_top2_ptr = nullptr;
283 
    // No "mid" pointer for Y: its loop below never reads the centre row.
284  const int16_t *input_y_low2_ptr = nullptr;
285  const int16_t *input_y_low_ptr = nullptr;
286  const int16_t *input_y_top_ptr = nullptr;
287  const int16_t *input_y_top2_ptr = nullptr;
288 
289  if(_run_sobel_x)
290  {
291  input_x = Iterator(_input_x, window);
292  output_x = Iterator(_output_x, window);
    // Addresses of the element at x = 0 on rows -2, -1, 0, +1, +2; the iterator offset is added per iteration.
293  input_x_top2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -2)));
294  input_x_top_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -1)));
295  input_x_mid_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 0)));
296  input_x_low_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 1)));
297  input_x_low2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 2)));
298  }
299 
300  if(_run_sobel_y)
301  {
302  input_y = Iterator(_input_y, window);
303  output_y = Iterator(_output_y, window);
    // Addresses of the element at x = 0 on rows -2, -1, +1, +2.
304  input_y_top2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -2)));
305  input_y_top_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -1)));
306  input_y_low_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 1)));
307  input_y_low2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 2)));
308  }
309 
    // Coefficients broadcast to all eight lanes.
310  static const int16x8_t six = vdupq_n_s16(6);
311  static const int16x8_t four = vdupq_n_s16(4);
312  static const int16x8_t two = vdupq_n_s16(2);
313  static const int16x8_t minustwo = vdupq_n_s16(-2);
314 
    // X output: column taps (1, 4, 6, 4, 1) applied to input_x.
315  if(_run_sobel_x)
316  {
317  execute_window_loop(window, [&](const Coordinates &)
318  {
319  // Convert the iterator's byte offset into an int16_t element offset
320  const size_t input_offset_high_s16 = input_x.offset() / 2;
    // The second group of eight elements follows directly after the first.
321  const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
322 
323  //HIGH DATA
324  //top2
325  int16x8_t data_high = vld1q_s16(input_x_top2_ptr + input_offset_high_s16);
326  int16x8_t out_high = data_high;
327  //top
328  data_high = vld1q_s16(input_x_top_ptr + input_offset_high_s16);
329  out_high = vmlaq_s16(out_high, data_high, four);
330  //mid
331  data_high = vld1q_s16(input_x_mid_ptr + input_offset_high_s16);
332  out_high = vmlaq_s16(out_high, data_high, six);
333  //low
334  data_high = vld1q_s16(input_x_low_ptr + input_offset_high_s16);
335  out_high = vmlaq_s16(out_high, data_high, four);
336  //low2
337  data_high = vld1q_s16(input_x_low2_ptr + input_offset_high_s16);
338  out_high = vaddq_s16(out_high, data_high);
339 
    // First eight results.
340  vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())), out_high);
341 
342  //LOW DATA
343  //top2
344  int16x8_t data_low = vld1q_s16(input_x_top2_ptr + input_offset_low_s16);
345  int16x8_t out_low = data_low;
346  //top
347  data_low = vld1q_s16(input_x_top_ptr + input_offset_low_s16);
348  out_low = vmlaq_s16(out_low, data_low, four);
349  //mid
350  data_low = vld1q_s16(input_x_mid_ptr + input_offset_low_s16);
351  out_low = vmlaq_s16(out_low, data_low, six);
352  //low
353  data_low = vld1q_s16(input_x_low_ptr + input_offset_low_s16);
354  out_low = vmlaq_s16(out_low, data_low, four);
355  //low2
356  data_low = vld1q_s16(input_x_low2_ptr + input_offset_low_s16);
357  out_low = vaddq_s16(out_low, data_low);
358 
    // Next eight results, stored eight int16_t elements further on.
359  vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())) + 8, out_low);
360  },
361  input_x, output_x);
362  }
363 
    // Y output: column taps (-1, -2, 0, 2, 1) applied to input_y.
364  if(_run_sobel_y)
365  {
366  execute_window_loop(window, [&](const Coordinates &)
367  {
368  // Convert the iterator's byte offset into an int16_t element offset
369  const size_t input_offset_high_s16 = input_y.offset() / 2;
    // The second group of eight elements follows directly after the first.
370  const size_t input_offset_low_s16 = input_offset_high_s16 + 8;
371 
372  //HIGH DATA
373  //top2
374  int16x8_t data_high = vld1q_s16(input_y_top2_ptr + input_offset_high_s16);
375  int16x8_t out_high = vnegq_s16(data_high);
376  //top
377  data_high = vld1q_s16(input_y_top_ptr + input_offset_high_s16);
378  out_high = vmlaq_s16(out_high, data_high, minustwo);
379  //low
380  data_high = vld1q_s16(input_y_low_ptr + input_offset_high_s16);
381  out_high = vmlaq_s16(out_high, data_high, two);
382  //low2
383  data_high = vld1q_s16(input_y_low2_ptr + input_offset_high_s16);
384  out_high = vaddq_s16(out_high, data_high);
385 
    // First eight results.
386  vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())), out_high);
387 
388  //LOW DATA
389  //top2
390  int16x8_t data_low = vld1q_s16(input_y_top2_ptr + input_offset_low_s16);
391  int16x8_t out_low = vnegq_s16(data_low);
392  //top
393  data_low = vld1q_s16(input_y_top_ptr + input_offset_low_s16);
394  out_low = vmlaq_s16(out_low, data_low, minustwo);
395  //low
396  data_low = vld1q_s16(input_y_low_ptr + input_offset_low_s16);
397  out_low = vmlaq_s16(out_low, data_low, two);
398  //low2
399  data_low = vld1q_s16(input_y_low2_ptr + input_offset_low_s16);
400  out_low = vaddq_s16(out_low, data_low);
401 
    // Next eight results, stored eight int16_t elements further on.
402  vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())) + 8, out_low);
403  },
404  input_y, output_y);
405  }
406 }
unsigned int top
top of the border
Definition: Types.h:375
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
Definition: ITensor.h:63
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
void shift(size_t dimension, int shift_value)
Shift the values of a given dimension by the given shift_value.
Definition: Window.inl:133
Container for 2D border size.
Definition: Types.h:273
1 channel, 1 U8 per channel
void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
Initialise the kernel's source, destination and border mode.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t,...)
Definition: Validate.h:643
Interface for Neon tensor.
Definition: ITensor.h:36
Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
Implementation of a rectangular access pattern.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: WindowHelpers.h:46
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
BorderSize border_size() const override
The size of the border for that kernel.
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
Coordinates of an item.
Definition: Coordinates.h:37
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
NESobel5x5HorKernel()
Default constructor.
unsigned int left
left of the border
Definition: Types.h:378
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
1 channel, 1 S16 per channel
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:790
NESobel5x5VertKernel()
Default constructor.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
Definition: CPPTypes.h:235
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
constexpr size_t offset() const
Return the offset in bytes from the first element to the current position of the iterator.
Definition: Helpers.inl:134
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
BorderSize border_size() const override
The size of the border for that kernel.
void configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
Initialise the kernel's source, destination and border mode.
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205