41 :
INEKernel(), _input(nullptr), _output(nullptr), _threshold(0), _non_max_suppression(false)
// Number of permutation index entries: it is the loop bound used when filling
// perm_index and when testing permutations in point_is_fast_corner.
47 constexpr
size_t PERMUTATIONS = 16;
// Second dimension constant used in the type of permutations_table.
// It has the same value as PERMUTATIONS (both are 16).
48 constexpr
size_t PERM_SIZE = 16;
// Builds the two 8-byte lookup-index vectors for permutation k.
//
// Row k of permutations_table holds 9 consecutive positions in the range 0..15
// (wrapping from 15 back to 0), followed by seven 255 entries used as padding.
// The first 8 bytes of the row and the last 8 bytes of the row are loaded into
// the two halves of `index`.
//
// k is used unchecked as a row index, so the caller must pass a valid row number
// (the visible caller iterates k from 0 to PERMUTATIONS - 1).
//
// NOTE(review): the table is declared as array<array<uint8_t, PERMUTATIONS>, PERM_SIZE>,
// i.e. the row length uses PERMUTATIONS and the row count uses PERM_SIZE. This is
// harmless while both constants are 16, but the two names look swapped - confirm.
50 inline uint8x8x2_t create_permutation_index(
size_t k)
54 static const std::array<std::array<uint8_t, PERMUTATIONS>, PERM_SIZE> permutations_table{ { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 },
55 { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 },
56 { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 },
57 { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 },
58 { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 },
59 { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 },
60 { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 },
61 { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 },
62 { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 },
63 { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 },
64 { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 },
65 { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 },
66 { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 },
67 { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 },
68 { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 },
69 { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 }
// Split the 16-byte row into two 8-byte registers: bytes 0..7 and bytes 8..15.
73 const uint8x8x2_t index =
76 vld1_u8(permutations_table[k].data()),
77 vld1_u8(permutations_table[k].data() + 8)
// Packs four static 8-byte index arrays into a single uint8x8x4_t.
//
// The register order is: val[0] = top_right, val[1] = bottom_right,
// val[2] = top_left, val[3] = bottom_left. get_circle_texels relies on this
// order (val[0]/val[2] index the upper rows table, val[1]/val[3] the lower one).
//
// NOTE(review): the initialisers of the four arrays are not visible in this
// excerpt, so the actual index values are not documented here.
84 inline uint8x8x4_t create_circle_index_register()
115 static const std::array<uint8_t, 8> top_right =
133 static const std::array<uint8_t, 8> bottom_right =
150 static const std::array<uint8_t, 8> top_left =
168 static const std::array<uint8_t, 8> bottom_left =
185 const uint8x8x4_t reg =
188 vld1_u8(top_right.data()),
189 vld1_u8(bottom_right.data()),
190 vld1_u8(top_left.data()),
191 vld1_u8(bottom_left.data())
// Gathers 16 bytes out of two lookup tables using four precomputed index vectors.
//
// index   four 8-byte index vectors (see create_circle_index_register).
// tbl_hi  4 x 8-byte table, addressed by index.val[0] and index.val[2].
// tbl_lo  3 x 8-byte table, addressed by index.val[1] and index.val[3].
//
// Each 8-byte half is produced in two steps: vtbl4_u8 does the lookup in tbl_hi,
// then vtbx3_u8 does a lookup in tbl_lo on top of that result. Per the Arm
// intrinsic semantics, vtbx only replaces lanes whose index is in range for its
// table and leaves the other lanes as they were, which is what lets the two
// lookups be merged into one vector.
// Returns the two halves combined: low half from val[0]/val[1], high half from
// val[2]/val[3].
198 inline uint8x16_t get_circle_texels(
const uint8x8x4_t &index,
const uint8x8x4_t &tbl_hi,
const uint8x8x3_t &tbl_lo)
205 return vcombine_u8(vtbx3_u8(vtbl4_u8(tbl_hi, index.val[0]), tbl_lo, index.val[1]),
206 vtbx3_u8(vtbl4_u8(tbl_hi, index.val[2]), tbl_lo, index.val[3]));
// Selects, from the 16-byte table tbl_circle, the bytes named by permutation_index.
//
// Low half of the result:  vtbl2_u8 lookup with permutation_index.val[0].
// High half of the result: vtbx2_u8 lookup with permutation_index.val[1] applied on
// top of a vector pre-filled with 255. Lanes whose index is out of range for the
// 16-byte table (the 255 padding entries of permutations_table) therefore keep the
// value 255, per the Arm vtbx semantics.
// is_permutation_brighter / is_permutation_darker depend on those padding lanes
// being exactly 255.
209 inline uint8x16_t get_permutation_texels(
const uint8x8x2_t &permutation_index,
const uint8x8x2_t &tbl_circle)
// Fill value for the lanes that do not correspond to a table entry.
243 static const uint8x8_t perm_right = vdup_n_u8(255);
245 return vcombine_u8(vtbl2_u8(tbl_circle, permutation_index.val[0]),
246 vtbx2_u8(perm_right, tbl_circle, permutation_index.val[1]));
// Returns true when every one of the 16 lanes of `permutation` is strictly greater
// than the matching lane of `pg`.
//
// vcgtq_u8 yields 0xFF per lane where the comparison holds. The two 8-byte halves
// are ANDed together and the result, viewed as one 64-bit value, must be all ones.
// Note that the padding lanes produced by get_permutation_texels hold 255, so they
// only satisfy the comparison when the corresponding pg lane is below 255.
249 inline bool is_permutation_brighter(
const uint8x16_t &permutation,
const uint8x16_t &pg)
251 const uint8x16_t res_gt = vcgtq_u8(permutation, pg);
253 return vget_lane_u64(vreinterpret_u64_u8(vand_u8(vget_high_u8(res_gt), vget_low_u8(res_gt))), 0) == std::numeric_limits<uint64_t>::max();
// Returns true when the 9 meaningful lanes of `permutation` are strictly less than
// the matching lanes of `pl`.
//
// vcltq_u8 yields 0xFF per lane where the comparison holds. The check is:
//   - t3 (lower 64 bits, lanes 0..7) must be all ones, and
//   - t4 (upper 64 bits, lanes 8..15) must equal exactly 255, i.e. only its lowest
//     byte is set (lane 8 on a little-endian lane layout - TODO confirm for
//     big-endian targets).
// The asymmetry with is_permutation_brighter is intentional: the padding lanes
// written by get_permutation_texels are 255, and 255 is never less than a uint8_t
// value, so those lanes always compare false and must be zero in t4.
256 inline bool is_permutation_darker(
const uint8x16_t &permutation,
const uint8x16_t &pl)
258 const uint8x16_t res_lt = vcltq_u8(permutation, pl);
259 const uint64x2_t u64res_lt = vreinterpretq_u64_u8(res_lt);
260 const uint64_t t3 = vgetq_lane_u64(u64res_lt, 0);
261 const uint64_t t4 = vgetq_lane_u64(u64res_lt, 1);
263 return std::numeric_limits<uint64_t>::max() == t3 && 255 == t4;
// Returns true when the permutation passes either test: all lanes brighter than
// `pg` (is_permutation_brighter) or the meaningful lanes darker than `pl`
// (is_permutation_darker). The darker test is skipped when the brighter test
// already succeeded (short-circuit ||).
266 inline bool is_permutation_corner(
const uint8x16_t &permutation,
const uint8x16_t &pg,
const uint8x16_t &pl)
268 return is_permutation_brighter(permutation, pg) || is_permutation_darker(permutation, pl);
// Tests whether pixel value p is a corner for the given threshold.
//
// p                  value of the candidate pixel.
// threshold          intensity difference used to build the two comparison bounds.
// tbl_circle_texels  16-byte table of surrounding texels (see create_circle_tbl).
// perm_indices       PERMUTATIONS lookup-index pairs (see create_permutation_index);
//                    taken by non-const reference but only read here.
//
// Returns true as soon as one permutation satisfies is_permutation_corner,
// false if none of the PERMUTATIONS entries does.
271 inline bool point_is_fast_corner(uint8_t p, uint8_t
threshold,
const uint8x8x2_t &tbl_circle_texels, std::array<uint8x8x2_t, PERMUTATIONS> &perm_indices)
// Upper and lower bounds broadcast to all 16 lanes; the saturating add/sub clamp
// them to 255 and 0 instead of wrapping.
276 uint8x16_t pg = vqaddq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
277 uint8x16_t pl = vqsubq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
279 bool corner_detected =
false;
// The loop condition stops at the first permutation that qualifies.
281 for(
size_t j = 0; !corner_detected && j < PERMUTATIONS; ++j)
283 const uint8x16_t pe_texels = get_permutation_texels(perm_indices[j], tbl_circle_texels);
284 corner_detected = is_permutation_corner(pe_texels, pg, pl);
287 return corner_detected;
// Builds the 16-byte texel table consumed by point_is_fast_corner.
//
// buffer          seven row pointers; 8 bytes are read from each at `in_offset`,
//                 so every row must have at least in_offset + 8 readable bytes.
// in_offset       byte offset added to every row pointer.
// circle_index_r  lookup indices from create_circle_index_register.
//
// Returns the 16 gathered bytes split into two 8-byte registers (low, high).
290 inline uint8x8x2_t create_circle_tbl(
const std::array<uint8_t *const __restrict, 7> &buffer,
size_t in_offset,
const uint8x8x4_t &circle_index_r)
// Rows 0..3 form the 4-register table used by the vtbl4 lookup.
300 const uint8x8x4_t tbl_window_hi =
303 vld1_u8(buffer[0] + in_offset),
304 vld1_u8(buffer[1] + in_offset),
305 vld1_u8(buffer[2] + in_offset),
306 vld1_u8(buffer[3] + in_offset)
// Rows 4..6 form the 3-register table used by the vtbx3 lookup.
310 const uint8x8x3_t tbl_window_lo =
313 vld1_u8(buffer[4] + in_offset),
314 vld1_u8(buffer[5] + in_offset),
315 vld1_u8(buffer[6] + in_offset)
319 const uint8x16_t circle_texels = get_circle_texels(circle_index_r, tbl_window_hi, tbl_window_lo);
// Repack as a 2-register table so it can be fed to vtbl2/vtbx2 later on.
321 const uint8x8x2_t tbl_circle_texels =
324 vget_low_u8(circle_texels),
325 vget_high_u8(circle_texels)
329 return tbl_circle_texels;
// Computes a score for pixel value p by re-running point_is_fast_corner with
// candidate thresholds derived from `tolerance`.
//
// NOTE(review): only part of this function is visible in this excerpt. The
// declaration of `b`, the enclosing loop and the bodies of the branches are
// missing - presumably a bisection between `a` and `b`; confirm against the
// full source before relying on this description.
332 inline uint8_t get_point_score(uint8_t p, uint8_t tolerance,
const uint8x8x2_t &tbl_circle, std::array<uint8x8x2_t, PERMUTATIONS> &perm_indices)
// Lower bound of the search starts at the caller's tolerance.
335 uint8_t a = tolerance;
// The sum is kept in 16 bits because a + b may exceed 255; c is the midpoint.
339 const uint16_t ab = a +
b;
340 const uint8_t c = ab >> 1;
// Test whether p still qualifies as a corner with the midpoint as threshold.
342 if(point_is_fast_corner(p, c, tbl_circle, perm_indices))
// Per-iteration access constants: 8 elements read, 1 element written, 7 rows read.
// NOTE(review): the statements that consume these constants (window and access
// pattern setup) are not visible in this excerpt.
375 constexpr
unsigned int num_elems_read_per_iteration = 8;
376 constexpr
unsigned int num_elems_written_per_iteration = 1;
377 constexpr
unsigned int num_rows_read_per_iteration = 7;
// Hand the window `win` to the base class (its construction is not visible here).
388 INEKernel::configure(win);
// One-off setup before the per-pixel work: build the lookup indices that the
// corner test helpers need.
// perm_index starts value-initialised and is then filled entry by entry.
397 std::array<uint8x8x2_t, PERMUTATIONS> perm_index{ {} };
402 const uint8x8x4_t circle_index_r = create_circle_index_register();
406 for(
size_t k = 0; k < PERMUTATIONS; ++k)
408 perm_index[k] = create_permutation_index(k);
// Seven row pointers later passed to create_circle_tbl.
// NOTE(review): the initialiser of in_row is not visible in this excerpt.
414 const std::array<uint8_t *const __restrict, 7> in_row
// Early-rejection predicate: returns true when both p and q lie inside the closed
// interval [a, b]. The lambda captures nothing.
425 auto is_rejected = [](uint8_t p, uint8_t q, uint8_t a, uint8_t
b)
427 const bool p_is_in_ab = (a <= p) && (p <=
b);
428 const bool q_is_in_ab = (a <= q) && (q <=
b);
429 return p_is_in_ab && q_is_in_ab;
// Per-pixel work. NOTE(review): the enclosing loop/lambda, the declaration of
// `score` and the code that stores the result are not visible in this excerpt.
// in_offset is the iterator's current byte offset; p0 is the pixel it points at.
434 const size_t in_offset = in.
offset();
435 const uint8_t p0 = *in.
ptr();
// Closed interval [a, b] = [p0 - _threshold, p0 + _threshold], clamped to 0..255
// by std::max / std::min before being narrowed to uint8_t.
436 const uint8_t
b = std::min(p0 + _threshold, 255);
437 const uint8_t a = std::max(p0 - _threshold, 0);
// First quick test: byte 3 of the first row (p1) and byte 3 of the last row (p9).
// Processing only continues if the two are not both inside [a, b].
444 const uint8_t p1 = (in_offset + in_row[0])[3];
445 const uint8_t p9 = (in_offset + in_row[6])[3];
447 if(!is_rejected(p1, p9, a, b))
// Second quick test: byte 6 (p5) and byte 0 (p13) of the middle row (row 3).
450 const uint8_t p5 = (in_offset + in_row[3])[6];
451 const uint8_t p13 = (in_offset + in_row[3])[0];
453 if(!is_rejected(p5, p13, a, b))
// Full test: gather the texel table and check all permutations.
456 const uint8x8x2_t tbl_circle_texel = create_circle_tbl(in_row, in_offset, circle_index_r);
458 if(point_is_fast_corner(p0, _threshold, tbl_circle_texel, perm_index))
// With non-maxima suppression enabled, the score comes from get_point_score.
460 if(_non_max_suppression)
462 score = get_point_score(p0, _threshold, tbl_circle_texel, perm_index);
unsigned int top
top of the border
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t)
Container for 2D border size.
Common interface for all kernels implemented in C++.
1 channel, 1 U8 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
Implementation of a rectangular access pattern.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Class to describe a number of elements in each dimension.
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
unsigned int left
left of the border
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
BorderSize border_size() const override
The size of the border for that kernel.
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
__kernel void non_max_suppression(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)
This function performs non-maxima suppression over a 3x3 window on a given image.
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda function for each element.
void configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined)
Initialise the kernel.
constexpr size_t offset() const
Return the offset in bytes from the first element to the current position of the iterator.
NEFastCornersKernel()
Constructor.
Iterator updated by execute_window_loop for each window element.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
SimpleTensor< T > threshold(const SimpleTensor< T > &src, T threshold, T false_value, T true_value, ThresholdType type, T upper)
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)