Compute Library
 21.02
NEFastCornersKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
27 #include "arm_compute/core/Error.h"
32 
33 #include <algorithm>
34 #include <arm_neon.h>
35 #include <cstddef>
36 #include <limits>
37 
38 using namespace arm_compute;
39 
41  : INEKernel(), _input(nullptr), _output(nullptr), _threshold(0), _non_max_suppression(false)
42 {
43 }
44 
45 namespace
46 {
47 constexpr size_t PERMUTATIONS = 16;
48 constexpr size_t PERM_SIZE = 16;
49 
50 inline uint8x8x2_t create_permutation_index(size_t k)
51 {
52  ARM_COMPUTE_ERROR_ON(k >= PERMUTATIONS);
53 
54  static const std::array<std::array<uint8_t, PERMUTATIONS>, PERM_SIZE> permutations_table{ { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 },
55  { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 },
56  { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 },
57  { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 },
58  { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 },
59  { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 },
60  { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 },
61  { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 },
62  { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 },
63  { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 },
64  { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 },
65  { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 },
66  { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 },
67  { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 },
68  { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 },
69  { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 }
70 
71  } };
72 
73  const uint8x8x2_t index =
74  {
75  {
76  vld1_u8(permutations_table[k].data()),
77  vld1_u8(permutations_table[k].data() + 8)
78  }
79  };
80 
81  return index;
82 }
83 
84 inline uint8x8x4_t create_circle_index_register()
85 {
86  /*
87  This function creates the index registers to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P.
88 
89  . . F 0 1 . . .
90  . E . . . 2 . .
91  D . . . . . 3 .
92  C . . P . . 4 .
93  B . . . . . 5 .
94  . A . . . 6 . .
95  . . 9 8 7 . . .
96 
97  Where . is an irrelevant texel value
98 
99  We want to retrieve all texels [0,F]
100 
101  The 4 registers in r will then be used to get these texels out of two tables in the function get_circle_texels()
102 
103  The first table holds the top 4 rows of texels
104  . . F 0 1 . . .
105  . E . . . 2 . .
106  D . . . . . 3 .
107  C . . P . . 4 .
108 
109  The second table the bottom 3 rows of texels
110  B . . . . . 5 .
111  . A . . . 6 . .
112  . . 9 8 7 . . .
113 
114  */
115  static const std::array<uint8_t, 8> top_right =
116  {
117  /* The register r.val[0] will be used to retrieve these texels:
118  . . . 0 1 . . .
119  . . . . . 2 . .
120  . . . . . . 3 .
121  . . . . . . 4 .
122  */
123  3 /* top table, first row, elem 4, value 0 in the diagram above */,
124  4 /* top table, first row, elem 5, value 1 in the diagram above */,
125  13 /* top table, second row, elem 6, value 2 in the diagram above */,
126  22 /* top table, third row, elem 7, value 3 in the diagram above*/,
127  30 /* top table, fourth row, elem 7, value 4 in the diagram above*/,
128  255,
129  255,
130  255
131  };
132 
133  static const std::array<uint8_t, 8> bottom_right =
134  {
135  /* The register r.val[1] will be used to retrieve these texels:
136  . . . . . . 5 .
137  . . . . . 6 . .
138  . . . . 7 . . .
139  */
140  255,
141  255,
142  255,
143  255,
144  255,
145  6 /* low table, first row, elem 7, value 5 in the diagram above*/,
146  13 /* low table, second row, elem 6, value 6 in the diagram above*/,
147  20 /* low table, third row, elem 5, value 7 in the diagram above*/
148  };
149 
150  static const std::array<uint8_t, 8> top_left =
151  {
152  /* The register r.val[2] will be used to retrieve these texels:
153  . . F . . . . .
154  . E . . . . . .
155  D . . . . . . .
156  C . . . . . . .
157  */
158  255,
159  255,
160  255,
161  255,
162  24 /* top table, fourth row, elem 1, value C in the diagram above */,
163  16 /* top table, third row, elem 1, value D in the diagram above*/,
164  9 /* top table, second row, elem 2, value E in the diagram above*/,
165  2 /* top table, first row, elem 3, value F in the diagram above*/
166  };
167 
168  static const std::array<uint8_t, 8> bottom_left =
169  {
170  /* The register r.val[3] will be used to retrieve these texels:
171  B . . . . . . .
172  . A . . . . . .
173  . . 9 8 . . . .
174  */
175  19 /* low table, third row, elem 4, value 8 in the diagram above */,
176  18 /* low table, third row, elem 3, value 9 in the diagram above */,
177  9 /* low table, second row, elem 2, value A in the diagram above */,
178  0 /* low table, first row, elem 1, value B in the diagram above */,
179  255,
180  255,
181  255,
182  255
183  };
184 
185  const uint8x8x4_t reg =
186  {
187  {
188  vld1_u8(top_right.data()),
189  vld1_u8(bottom_right.data()),
190  vld1_u8(top_left.data()),
191  vld1_u8(bottom_left.data())
192  }
193  };
194 
195  return reg;
196 }
197 
198 inline uint8x16_t get_circle_texels(const uint8x8x4_t &index, const uint8x8x4_t &tbl_hi, const uint8x8x3_t &tbl_lo)
199 {
200  /*
201  This function loads the 16 texels in the Bresenham circle of radius 3 into the register 'texels'.
202  The parameter 'index' is an array of indices which was previously setup in setup_circle_index_register().
203  tbl_hi and tbl_lo are the two tables holding the texels in the window [(-3,-3),(+3,+3)] for a given texel P
204  */
205  return vcombine_u8(vtbx3_u8(vtbl4_u8(tbl_hi, index.val[0]), tbl_lo, index.val[1]),
206  vtbx3_u8(vtbl4_u8(tbl_hi, index.val[2]), tbl_lo, index.val[3]));
207 }
208 
209 inline uint8x16_t get_permutation_texels(const uint8x8x2_t &permutation_index, const uint8x8x2_t &tbl_circle)
210 {
211  /*
212  This function stores the 9 texels of a give permutation X in the neon register 'texels'
213 
214  'tbl_circle' is a LUT with the texels 0 to F
215 
216  . . F 0 1 . . .
217  . E . . . 2 . .
218  D . . . . . 3 .
219  C . . P . . 4 .
220  B . . . . . 5 .
221  . A . . . 6 . .
222  . . 9 8 7 . . .
223 
224  'permutation_index' is one of the permutations below:
225 
226  { 0, 1, 2, 3, 4, 5, 6, 7, 8},
227  { F, 0, 1, 2, 3, 4, 5, 6, 7},
228  { E, F, 0, 1, 2, 3, 4, 5, 6},
229  { D, E, F, 0, 1, 2, 3, 4, 5},
230  { C, D, E, F, 0, 1, 2, 3, 4},
231  { B, C, D, E, F, 0, 1, 2, 3},
232  { A, B, C, D, E, F, 0, 1, 2},
233  { 9, A, B, C, D, E, F, 0, 1},
234  { 8, 9, A, B, C, D, E, F, 0},
235  { 7, 8, 9, A, B, C, D, E, F},
236  { 6, 7, 8, 9, A, B, C, D, E},
237  { 5, 6, 7, 8, 9, A, B, C, D},
238  { 4, 5, 6, 7, 8, 9, A, B, C},
239  { 3, 4, 5, 6, 7, 8, 9, A, B},
240  { 2, 3, 4, 5, 6, 7, 8, 9, A},
241  { 1, 2, 3, 4, 5, 6, 7, 8, 9},
242  */
243  static const uint8x8_t perm_right = vdup_n_u8(255); // init to 255 so that vtbx preserves the original values of the lanes
244 
245  return vcombine_u8(vtbl2_u8(tbl_circle, permutation_index.val[0]),
246  vtbx2_u8(perm_right, tbl_circle, permutation_index.val[1]));
247 }
248 
249 inline bool is_permutation_brighter(const uint8x16_t &permutation, const uint8x16_t &pg)
250 {
251  const uint8x16_t res_gt = vcgtq_u8(permutation, pg);
252 
253  return vget_lane_u64(vreinterpret_u64_u8(vand_u8(vget_high_u8(res_gt), vget_low_u8(res_gt))), 0) == std::numeric_limits<uint64_t>::max();
254 }
255 
256 inline bool is_permutation_darker(const uint8x16_t &permutation, const uint8x16_t &pl)
257 {
258  const uint8x16_t res_lt = vcltq_u8(permutation, pl);
259  const uint64x2_t u64res_lt = vreinterpretq_u64_u8(res_lt);
260  const uint64_t t3 = vgetq_lane_u64(u64res_lt, 0);
261  const uint64_t t4 = vgetq_lane_u64(u64res_lt, 1);
262 
263  return std::numeric_limits<uint64_t>::max() == t3 && 255 == t4;
264 }
265 
266 inline bool is_permutation_corner(const uint8x16_t &permutation, const uint8x16_t &pg, const uint8x16_t &pl)
267 {
268  return is_permutation_brighter(permutation, pg) || is_permutation_darker(permutation, pl);
269 }
270 
271 inline bool point_is_fast_corner(uint8_t p, uint8_t threshold, const uint8x8x2_t &tbl_circle_texels, std::array<uint8x8x2_t, PERMUTATIONS> &perm_indices)
272 {
273  /*
274  This function determines whether the point 'p' is a corner.
275  */
276  uint8x16_t pg = vqaddq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
277  uint8x16_t pl = vqsubq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
278 
279  bool corner_detected = false;
280 
281  for(size_t j = 0; !corner_detected && j < PERMUTATIONS; ++j)
282  {
283  const uint8x16_t pe_texels = get_permutation_texels(perm_indices[j], tbl_circle_texels);
284  corner_detected = is_permutation_corner(pe_texels, pg, pl);
285  }
286 
287  return corner_detected;
288 }
289 
290 inline uint8x8x2_t create_circle_tbl(const std::array<uint8_t *const __restrict, 7> &buffer, size_t in_offset, const uint8x8x4_t &circle_index_r)
291 {
292  /*
293  This function builds a LUT holding the 16 texels in the Brensenham circle radius 3.
294  circle_index_r is a vector of 4 registers to retrieve the texels from the two tables mentioned above.
295  */
296 
297  //Load the texels in the window [(x-3,y-3),(x+3,y+3)].
298  //The top 4 rows are loaded in tbl_hi and the low 3 rows in tbl_lo.
299  //These two tables are then used to retrieve the texels in the Bresenham circle of radius 3.
300  const uint8x8x4_t tbl_window_hi =
301  {
302  {
303  vld1_u8(buffer[0] + in_offset),
304  vld1_u8(buffer[1] + in_offset),
305  vld1_u8(buffer[2] + in_offset),
306  vld1_u8(buffer[3] + in_offset)
307  }
308  };
309 
310  const uint8x8x3_t tbl_window_lo =
311  {
312  {
313  vld1_u8(buffer[4] + in_offset),
314  vld1_u8(buffer[5] + in_offset),
315  vld1_u8(buffer[6] + in_offset)
316  }
317  };
318 
319  const uint8x16_t circle_texels = get_circle_texels(circle_index_r, tbl_window_hi, tbl_window_lo);
320 
321  const uint8x8x2_t tbl_circle_texels =
322  {
323  {
324  vget_low_u8(circle_texels),
325  vget_high_u8(circle_texels)
326  }
327  };
328 
329  return tbl_circle_texels;
330 }
331 
332 inline uint8_t get_point_score(uint8_t p, uint8_t tolerance, const uint8x8x2_t &tbl_circle, std::array<uint8x8x2_t, PERMUTATIONS> &perm_indices)
333 {
334  uint8_t b = 255;
335  uint8_t a = tolerance;
336 
337  while(b - a > 1)
338  {
339  const uint16_t ab = a + b;
340  const uint8_t c = ab >> 1;
341 
342  if(point_is_fast_corner(p, c, tbl_circle, perm_indices))
343  {
344  a = c;
345  }
346  else
347  {
348  b = c;
349  }
350  }
351 
352  return a;
353 }
354 } // namespace
355 
357 {
358  return BorderSize(3);
359 }
360 
361 void NEFastCornersKernel::configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined)
362 {
367  ARM_COMPUTE_ERROR_ON_MSG(border_undefined == false, "Not implemented");
368 
369  _input = input;
370  _output = output;
371  _threshold = threshold;
372  _non_max_suppression = non_max_suppression;
373 
374  constexpr unsigned int num_elems_processed_per_iteration = 1;
375  constexpr unsigned int num_elems_read_per_iteration = 8;
376  constexpr unsigned int num_elems_written_per_iteration = 1;
377  constexpr unsigned int num_rows_read_per_iteration = 7;
378 
379  // Configure kernel window
380  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
381  AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
382  AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
383 
384  update_window_and_padding(win, input_access, output_access);
385 
386  output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
387 
388  INEKernel::configure(win);
389 }
390 
392 {
393  ARM_COMPUTE_UNUSED(info);
396 
397  std::array<uint8x8x2_t, PERMUTATIONS> perm_index{ {} };
398  /*
399  We use a LUT loaded with 7 rows of uint8_t from the input image [-3,-3]...[+3,+3] to retrieve the texels in the Brensenham circle radius 3 and put them in one neon register uint8x16_t.
400  The three lines below setup the neon index registers to get these texels out from the table
401  */
402  const uint8x8x4_t circle_index_r = create_circle_index_register();
403  /*
404  We put the 16 texels (circle) in a LUT to easily generate all the permutations. The for block below setups the indices for each permutation.
405  */
406  for(size_t k = 0; k < PERMUTATIONS; ++k)
407  {
408  perm_index[k] = create_permutation_index(k);
409  }
410 
411  Iterator in(_input, window);
412  Iterator out(_output, window);
413 
414  const std::array<uint8_t *const __restrict, 7> in_row
415  {
416  _input->ptr_to_element(Coordinates(-3, -3)),
417  _input->ptr_to_element(Coordinates(-3, -2)),
418  _input->ptr_to_element(Coordinates(-3, -1)),
419  _input->ptr_to_element(Coordinates(-3, 0)),
420  _input->ptr_to_element(Coordinates(-3, 1)),
421  _input->ptr_to_element(Coordinates(-3, 2)),
422  _input->ptr_to_element(Coordinates(-3, 3))
423  };
424 
425  auto is_rejected = [](uint8_t p, uint8_t q, uint8_t a, uint8_t b)
426  {
427  const bool p_is_in_ab = (a <= p) && (p <= b);
428  const bool q_is_in_ab = (a <= q) && (q <= b);
429  return p_is_in_ab && q_is_in_ab;
430  };
431 
432  execute_window_loop(window, [&](const Coordinates &)
433  {
434  const size_t in_offset = in.offset();
435  const uint8_t p0 = *in.ptr();
436  const uint8_t b = std::min(p0 + _threshold, 255);
437  const uint8_t a = std::max(p0 - _threshold, 0);
438  uint8_t score = 0;
439  /*
440  Fast check to discard points which cannot be corners and avoid the expensive computation of the potential 16 permutations
441 
442  pixels 1 and 9 are examined, if both I1 and I9 are within [Ip - t, Ip + t], then candidate p is not a corner.
443  */
444  const uint8_t p1 = (in_offset + in_row[0])[3];
445  const uint8_t p9 = (in_offset + in_row[6])[3];
446 
447  if(!is_rejected(p1, p9, a, b))
448  {
449  /* pixels 5 and 13 are further examined to check whether three of them are brighter than Ip + t or darker than Ip - t */
450  const uint8_t p5 = (in_offset + in_row[3])[6];
451  const uint8_t p13 = (in_offset + in_row[3])[0];
452 
453  if(!is_rejected(p5, p13, a, b))
454  {
455  /* at this stage we use the full test with the 16 permutations to classify the point as corner or not */
456  const uint8x8x2_t tbl_circle_texel = create_circle_tbl(in_row, in_offset, circle_index_r);
457 
458  if(point_is_fast_corner(p0, _threshold, tbl_circle_texel, perm_index))
459  {
460  if(_non_max_suppression)
461  {
462  score = get_point_score(p0, _threshold, tbl_circle_texel, perm_index);
463  }
464  else
465  {
466  score = 1;
467  }
468  }
469  }
470  }
471 
472  *out.ptr() = score;
473  },
474  in, out);
475 }
unsigned int top
top of the border
Definition: Types.h:375
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
Definition: ITensor.h:63
#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t)
Definition: Validate.h:856
SimpleTensor< float > b
Definition: DFT.cpp:157
Container for 2D border size.
Definition: Types.h:273
Common interface for all kernels implemented in C++.
Definition: ICPPKernel.h:38
1 channel, 1 U8 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Interface for Neon tensor.
Definition: ITensor.h:36
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
Implementation of a rectangular access pattern.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: WindowHelpers.h:46
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
Definition: Error.h:456
Coordinates of an item.
Definition: Coordinates.h:37
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor&#39;s metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
unsigned int left
left of the border
Definition: Types.h:378
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
BorderSize border_size() const override
The size of the border for that kernel.
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:790
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
Definition: CPPTypes.h:235
__kernel void non_max_suppression(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)
This function performs Non maxima suppression over a 3x3 window on a given image. ...
Definition: nonmax.cl:41
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
void configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined)
Initialise the kernel.
constexpr size_t offset() const
Return the offset in bytes from the first element to the current position of the iterator.
Definition: Helpers.inl:134
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
SimpleTensor< T > threshold(const SimpleTensor< T > &src, T threshold, T false_value, T true_value, ThresholdType type, T upper)
Definition: Threshold.cpp:35
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205