Compute Library
 21.02
NEHarrisCornersKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
27 #include "arm_compute/core/Error.h"
30 #include "arm_compute/core/Types.h"
31 #include "arm_compute/core/Utils.h"
36 
37 #include <algorithm>
38 #include <arm_neon.h>
39 #include <cmath>
40 #include <cstddef>
41 
42 using namespace arm_compute;
43 
50 
51 namespace
52 {
53 inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
54 {
55  // Trace^2
56  float32x4_t trace2 = vaddq_f32(gx2, gy2);
57  trace2 = vmulq_f32(trace2, trace2);
58 
59  // Det(A)
60  float32x4_t det = vmulq_f32(gx2, gy2);
61  det = vmlsq_f32(det, gxgy, gxgy);
62 
63  // Det(A) - sensitivity * trace^2
64  const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
65 
66  // mc > strength_thresh
67  const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
68 
69  return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
70 }
71 
72 inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
73  float32x4_t norm_factor)
74 {
75  // Normalize
76  low_gx = vmulq_f32(low_gx, norm_factor);
77  low_gy = vmulq_f32(low_gy, norm_factor);
78  high_gx = vmulq_f32(high_gx, norm_factor);
79  high_gy = vmulq_f32(high_gy, norm_factor);
80 
81  const float32x4_t l_gx = low_gx;
82  const float32x4_t l_gy = low_gy;
83  const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
84  const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
85  const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
86  const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
87 
88  // Gx*Gx
89  gx2 = vmlaq_f32(gx2, l_gx, l_gx);
90  gx2 = vmlaq_f32(gx2, m_gx, m_gx);
91  gx2 = vmlaq_f32(gx2, r_gx, r_gx);
92 
93  // Gy*Gy
94  gy2 = vmlaq_f32(gy2, l_gy, l_gy);
95  gy2 = vmlaq_f32(gy2, m_gy, m_gy);
96  gy2 = vmlaq_f32(gy2, r_gy, r_gy);
97 
98  // Gx*Gy
99  gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
100  gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
101  gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
102 }
103 
104 inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
105  float32x4_t norm_factor)
106 {
107  // Normalize
108  low_gx = vmulq_f32(low_gx, norm_factor);
109  low_gy = vmulq_f32(low_gy, norm_factor);
110  high_gx = vmulq_f32(high_gx, norm_factor);
111  high_gy = vmulq_f32(high_gy, norm_factor);
112 
113  // L2 values
114  float32x4_t gx = low_gx;
115  float32x4_t gy = low_gy;
116 
117  // Accumulate
118  gx2 = vmlaq_f32(gx2, gx, gx);
119  gy2 = vmlaq_f32(gy2, gy, gy);
120  gxgy = vmlaq_f32(gxgy, gx, gy);
121 
122  // L1 values
123  gx = vextq_f32(low_gx, high_gx, 1);
124  gy = vextq_f32(low_gy, high_gy, 1);
125 
126  // Accumulate
127  gx2 = vmlaq_f32(gx2, gx, gx);
128  gy2 = vmlaq_f32(gy2, gy, gy);
129  gxgy = vmlaq_f32(gxgy, gx, gy);
130 
131  // M values
132  gx = vextq_f32(low_gx, high_gx, 2);
133  gy = vextq_f32(low_gy, high_gy, 2);
134 
135  // Accumulate
136  gx2 = vmlaq_f32(gx2, gx, gx);
137  gy2 = vmlaq_f32(gy2, gy, gy);
138  gxgy = vmlaq_f32(gxgy, gx, gy);
139 
140  // R1 values
141  gx = vextq_f32(low_gx, high_gx, 3);
142  gy = vextq_f32(low_gy, high_gy, 3);
143 
144  // Accumulate
145  gx2 = vmlaq_f32(gx2, gx, gx);
146  gy2 = vmlaq_f32(gy2, gy, gy);
147  gxgy = vmlaq_f32(gxgy, gx, gy);
148 
149  // R2 values
150  gx = high_gx;
151  gy = high_gy;
152 
153  // Accumulate
154  gx2 = vmlaq_f32(gx2, gx, gx);
155  gy2 = vmlaq_f32(gy2, gy, gy);
156  gxgy = vmlaq_f32(gxgy, gx, gy);
157 }
158 
159 inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
160  float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
161 {
162  // Normalize
163  low_gx = vmulq_f32(low_gx, norm_factor);
164  low_gy = vmulq_f32(low_gy, norm_factor);
165  high_gx = vmulq_f32(high_gx, norm_factor);
166  high_gy = vmulq_f32(high_gy, norm_factor);
167 
168  // L3 values
169  float32x4_t gx = low_gx;
170  float32x4_t gy = low_gy;
171 
172  // Accumulate
173  gx2 = vmlaq_f32(gx2, gx, gx);
174  gy2 = vmlaq_f32(gy2, gy, gy);
175  gxgy = vmlaq_f32(gxgy, gx, gy);
176 
177  // L2 values
178  gx = vextq_f32(low_gx, high_gx, 1);
179  gy = vextq_f32(low_gy, high_gy, 1);
180 
181  // Accumulate
182  gx2 = vmlaq_f32(gx2, gx, gx);
183  gy2 = vmlaq_f32(gy2, gy, gy);
184  gxgy = vmlaq_f32(gxgy, gx, gy);
185 
186  // L1 values
187  gx = vextq_f32(low_gx, high_gx, 2);
188  gy = vextq_f32(low_gy, high_gy, 2);
189 
190  // Accumulate
191  gx2 = vmlaq_f32(gx2, gx, gx);
192  gy2 = vmlaq_f32(gy2, gy, gy);
193  gxgy = vmlaq_f32(gxgy, gx, gy);
194 
195  // M values
196  gx = vextq_f32(low_gx, high_gx, 3);
197  gy = vextq_f32(low_gy, high_gy, 3);
198 
199  // Accumulate
200  gx2 = vmlaq_f32(gx2, gx, gx);
201  gy2 = vmlaq_f32(gy2, gy, gy);
202  gxgy = vmlaq_f32(gxgy, gx, gy);
203 
204  // R1 values
205  gx = high_gx;
206  gy = high_gy;
207 
208  // Accumulate
209  gx2 = vmlaq_f32(gx2, gx, gx);
210  gy2 = vmlaq_f32(gy2, gy, gy);
211  gxgy = vmlaq_f32(gxgy, gx, gy);
212 
213  // Change tmp_low and tmp_high for calculating R2 and R3 values
214  low_gx = high_gx;
215  low_gy = high_gy;
216  high_gx = high_gx1;
217  high_gy = high_gy1;
218 
219  // Normalize
220  high_gx = vmulq_f32(high_gx, norm_factor);
221  high_gy = vmulq_f32(high_gy, norm_factor);
222 
223  // R2 values
224  gx = vextq_f32(low_gx, high_gx, 1);
225  gy = vextq_f32(low_gy, high_gy, 1);
226 
227  // Accumulate
228  gx2 = vmlaq_f32(gx2, gx, gx);
229  gy2 = vmlaq_f32(gy2, gy, gy);
230  gxgy = vmlaq_f32(gxgy, gx, gy);
231 
232  // R3 values
233  gx = vextq_f32(low_gx, high_gx, 2);
234  gy = vextq_f32(low_gy, high_gy, 2);
235 
236  // Accumulate
237  gx2 = vmlaq_f32(gx2, gx, gx);
238  gy2 = vmlaq_f32(gy2, gy, gy);
239  gxgy = vmlaq_f32(gxgy, gx, gy);
240 }
241 
242 inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
243  float in_norm_factor, float in_sensitivity, float in_strength_thresh)
244 
245 {
246  const auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
247  const auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
248  const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
249  const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
250  const auto output = static_cast<float *__restrict>(output_ptr);
251 
252  // Gx^2, Gy^2 and Gx*Gy
253  float32x4x2_t gx2 =
254  {
255  {
256  vdupq_n_f32(0.0f),
257  vdupq_n_f32(0.0f)
258  }
259  };
260  float32x4x2_t gy2 =
261  {
262  {
263  vdupq_n_f32(0.0f),
264  vdupq_n_f32(0.0f)
265  }
266  };
267  float32x4x2_t gxgy =
268  {
269  {
270  vdupq_n_f32(0.0f),
271  vdupq_n_f32(0.0f)
272  }
273  };
274 
275  // Row0
276  int16x8x2_t tmp_gx =
277  {
278  {
279  vld1q_s16(gx_ptr_0 - input_stride),
280  vld1q_s16(gx_ptr_1 - input_stride)
281  }
282  };
283  int16x8x2_t tmp_gy =
284  {
285  {
286  vld1q_s16(gy_ptr_0 - input_stride),
287  vld1q_s16(gy_ptr_1 - input_stride)
288  }
289  };
290  float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
291  float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
292  float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
293 
294  float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
295  float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
296  float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
297  float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
298  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
299 
300  low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
301  low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
302  high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
303  high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
304  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
305 
306  // Row1
307  tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
308  tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
309  tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
310  tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
311 
312  low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
313  low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
314  high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
315  high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
316  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
317 
318  low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
319  low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
320  high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
321  high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
322  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
323 
324  // Row2
325  tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
326  tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
327  tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
328  tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
329 
330  low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
331  low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
332  high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
333  high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
334  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
335 
336  low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
337  low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
338  high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
339  high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
340  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
341 
342  // Calculate harris score
343  const float32x4x2_t mc =
344  {
345  {
346  harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
347  harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
348  }
349  };
350 
351  // Store score
352  vst1q_f32(output + 0, mc.val[0]);
353  vst1q_f32(output + 4, mc.val[1]);
354 }
355 
356 inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
357  float in_norm_factor, float in_sensitivity, float in_strength_thresh)
358 {
359  auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
360  auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
361  const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
362  const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
363  const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
364  const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
365  const auto output = static_cast<float *__restrict>(output_ptr);
366  float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
367  float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
368  float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
369 
370  // Gx^2, Gy^2 and Gx*Gy
371  float32x4x2_t gx2 =
372  {
373  {
374  vdupq_n_f32(0.0f),
375  vdupq_n_f32(0.0f)
376  }
377  };
378  float32x4x2_t gy2 =
379  {
380  {
381  vdupq_n_f32(0.0f),
382  vdupq_n_f32(0.0f)
383  }
384  };
385  float32x4x2_t gxgy =
386  {
387  {
388  vdupq_n_f32(0.0f),
389  vdupq_n_f32(0.0f)
390  }
391  };
392 
393  // Row0
394  float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
395  float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
396  float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
397  float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
398  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
399 
400  low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
401  low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
402  high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
403  high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
404  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
405 
406  // Row1
407  low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
408  low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
409  high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
410  high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
411  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
412 
413  low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
414  low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
415  high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
416  high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
417  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
418 
419  // Row2
420  low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
421  low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
422  high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
423  high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
424  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
425 
426  low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
427  low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
428  high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
429  high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
430  harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
431 
432  // Calculate harris score
433  const float32x4x2_t mc =
434  {
435  {
436  harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
437  harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
438  }
439  };
440 
441  // Store score
442  vst1q_f32(output + 0, mc.val[0]);
443  vst1q_f32(output + 4, mc.val[1]);
444 }
445 
446 inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
447  float in_norm_factor, float in_sensitivity, float in_strength_thresh)
448 {
449  auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
450  auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
451  const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
452  const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
453  const auto output = static_cast<float *__restrict>(output_ptr);
454 
455  // Gx^2, Gy^2 and Gx*Gy
456  float32x4x2_t gx2 =
457  {
458  {
459  vdupq_n_f32(0.0f),
460  vdupq_n_f32(0.0f)
461  }
462  };
463  float32x4x2_t gy2 =
464  {
465  {
466  vdupq_n_f32(0.0f),
467  vdupq_n_f32(0.0f)
468  }
469  };
470  float32x4x2_t gxgy =
471  {
472  {
473  vdupq_n_f32(0.0f),
474  vdupq_n_f32(0.0f)
475  }
476  };
477  float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
478  float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
479  float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
480 
481  for(int i = 0; i < 5; ++i)
482  {
483  const int16x8x2_t tmp_gx =
484  {
485  {
486  vld1q_s16(gx_ptr_0),
487  vld1q_s16(gx_ptr_1)
488  }
489  };
490  const int16x8x2_t tmp_gy =
491  {
492  {
493  vld1q_s16(gy_ptr_0),
494  vld1q_s16(gy_ptr_1)
495  }
496  };
497 
498  float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
499  float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
500  float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
501  float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
502  harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
503 
504  low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
505  low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
506  high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
507  high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
508  harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
509 
510  // Update gx and gy pointer
511  gx_ptr_0 += input_stride;
512  gy_ptr_0 += input_stride;
513  gx_ptr_1 += input_stride;
514  gy_ptr_1 += input_stride;
515  }
516 
517  // Calculate harris score
518  const float32x4x2_t mc =
519  {
520  {
521  harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
522  harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
523  }
524  };
525 
526  // Store score
527  vst1q_f32(output + 0, mc.val[0]);
528  vst1q_f32(output + 4, mc.val[1]);
529 }
530 
531 inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
532  float in_norm_factor, float in_sensitivity, float in_strength_thresh)
533 
534 {
535  auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
536  auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
537  const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
538  const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
539  const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
540  const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
541  const auto output = static_cast<float *__restrict>(output_ptr);
542 
543  // Gx^2, Gy^2 and Gx*Gy
544  float32x4x2_t gx2 =
545  {
546  {
547  vdupq_n_f32(0.0f),
548  vdupq_n_f32(0.0f)
549  }
550  };
551  float32x4x2_t gy2 =
552  {
553  {
554  vdupq_n_f32(0.0f),
555  vdupq_n_f32(0.0f)
556  }
557  };
558  float32x4x2_t gxgy =
559  {
560  {
561  vdupq_n_f32(0.0f),
562  vdupq_n_f32(0.0f)
563  }
564  };
565  float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
566  float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
567  float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
568 
569  for(int i = 0; i < 5; ++i)
570  {
571  const float32x4_t low_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
572  const float32x4_t low_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
573  const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
574  const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
575  harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
576 
577  const float32x4_t low_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
578  const float32x4_t low_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
579  const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
580  const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
581  harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
582 
583  // Update gx and gy pointer
584  gx_ptr_0 += input_stride;
585  gy_ptr_0 += input_stride;
586  gx_ptr_1 += input_stride;
587  gy_ptr_1 += input_stride;
588  gx_ptr_2 += input_stride;
589  gy_ptr_2 += input_stride;
590  }
591 
592  // Calculate harris score
593  const float32x4x2_t mc =
594  {
595  {
596  harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
597  harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
598  }
599  };
600 
601  // Store score
602  vst1q_f32(output + 0, mc.val[0]);
603  vst1q_f32(output + 4, mc.val[1]);
604 }
605 
606 inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
607  float in_norm_factor, float in_sensitivity, float in_strength_thresh)
608 {
609  auto gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
610  auto gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
611  const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
612  const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
613  const auto output = static_cast<float *__restrict>(output_ptr);
614 
615  // Gx^2, Gy^2 and Gx*Gy
616  float32x4_t gx2 = vdupq_n_f32(0.0f);
617  float32x4_t gy2 = vdupq_n_f32(0.0f);
618  float32x4_t gxgy = vdupq_n_f32(0.0f);
619  float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
620  float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
621  float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
622 
623  for(int i = 0; i < 7; ++i)
624  {
625  const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
626  const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
627  const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
628  const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
629 
630  float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
631  float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
632  float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
633  float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
634  float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
635  float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
636  harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
637 
638  // Update gx and gy pointer
639  gx_ptr_0 += input_stride;
640  gy_ptr_0 += input_stride;
641  gx_ptr_1 += input_stride;
642  gy_ptr_1 += input_stride;
643  }
644 
645  // Calculate harris score
646  const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
647 
648  // Store score
649  vst1q_f32(output, mc);
650 }
651 
652 inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
653  float in_norm_factor, float in_sensitivity, float in_strength_thresh)
654 {
655  auto gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
656  auto gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
657  const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
658  const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
659  const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
660  const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
661  const auto output = static_cast<float *__restrict>(output_ptr);
662 
663  // Gx^2, Gy^2 and Gx*Gy
664  float32x4_t gx2 = vdupq_n_f32(0.0f);
665  float32x4_t gy2 = vdupq_n_f32(0.0f);
666  float32x4_t gxgy = vdupq_n_f32(0.0f);
667  float32x4_t sensitivity = vdupq_n_f32(in_sensitivity);
668  float32x4_t norm_factor = vdupq_n_f32(in_norm_factor);
669  float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
670 
671  for(int i = 0; i < 7; ++i)
672  {
673  const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
674  const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
675  const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
676  const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
677  const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
678  const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
679  harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
680 
681  // Update gx and gy pointer
682  gx_ptr_0 += input_stride;
683  gy_ptr_0 += input_stride;
684  gx_ptr_1 += input_stride;
685  gy_ptr_1 += input_stride;
686  gx_ptr_2 += input_stride;
687  gy_ptr_2 += input_stride;
688  }
689 
690  // Calculate harris score
691  const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
692 
693  // Store score
694  vst1q_f32(output, mc);
695 }
696 
697 } // namespace
698 
700  : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
701 {
702 }
703 
704 template <int32_t block_size>
706  : INEHarrisScoreKernel(), _func(nullptr)
707 {
708 }
709 
710 template <int32_t block_size>
712 {
713  ARM_COMPUTE_UNUSED(info);
716  ARM_COMPUTE_ERROR_ON(_func == nullptr);
717 
718  Iterator input1(_input1, window);
719  Iterator input2(_input2, window);
720  Iterator output(_output, window);
721 
722  const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
723 
724  execute_window_loop(window, [&](const Coordinates &)
725  {
726  (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
727  },
728  input1, input2, output);
729 }
730 
731 template <int32_t block_size>
733 {
734  return _border_size;
735 }
736 
737 template <int32_t block_size>
738 void NEHarrisScoreKernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
739  bool border_undefined)
740 {
748  ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
749 
750  _input1 = input1;
751  _input2 = input2;
752  _output = output;
753  _sensitivity = sensitivity;
754  _strength_thresh = strength_thresh;
755  _norm_factor = norm_factor;
756  _border_size = BorderSize(block_size / 2);
757 
758  if(input1->info()->data_type() == DataType::S16)
759  {
760  switch(block_size)
761  {
762  case 3:
763  _func = &harris_score3x3_S16_S16_FLOAT;
764  break;
765  case 5:
766  _func = &harris_score5x5_S16_S16_FLOAT;
767  break;
768  case 7:
769  _func = &harris_score7x7_S16_S16_FLOAT;
770  break;
771  default:
772  ARM_COMPUTE_ERROR("Invalid block size");
773  break;
774  }
775  }
776  else
777  {
778  switch(block_size)
779  {
780  case 3:
781  _func = &harris_score3x3_S32_S32_FLOAT;
782  break;
783  case 5:
784  _func = &harris_score5x5_S32_S32_FLOAT;
785  break;
786  case 7:
787  _func = &harris_score7x7_S32_S32_FLOAT;
788  break;
789  default:
790  ARM_COMPUTE_ERROR("Invalid block size");
791  break;
792  }
793  }
794 
795  ARM_COMPUTE_ERROR_ON(nullptr == _func);
796 
797  constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
798  constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
799  constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
800  constexpr unsigned int num_rows_read_per_iteration = block_size;
801 
802  // Configure kernel window
803  Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
804  AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
805 
807  AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
808  AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
809  output_access);
810 
812  input2->info()->valid_region());
813 
814  output_access.set_valid_region(win, valid_region, border_undefined, border_size());
815 
816  INEKernel::configure(win);
817 }
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t)
Definition: Validate.h:856
Container for 2D border size.
Definition: Types.h:273
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
size_t element_size_from_data_type(DataType dt)
The size in bytes of the data type.
Definition: Utils.h:185
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
Template Neon kernel to perform Harris Score.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
const ValidRegion valid_region
Definition: Scale.cpp:221
Interface for Neon tensor.
Definition: ITensor.h:36
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
1 channel, 1 S32 per channel
Implementation of a rectangular access pattern.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: WindowHelpers.h:46
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:543
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
BorderSize border_size() const override
The size of the border for that kernel.
Coordinates of an item.
Definition: Coordinates.h:37
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
ValidRegion intersect_valid_regions(const Ts &... regions)
Intersect multiple valid regions.
Definition: WindowHelpers.h:74
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
1 channel, 1 S16 per channel
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:790
void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override
Setup the kernel parameters.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
Definition: CPPTypes.h:235
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
Container for valid region of a window.
Definition: Types.h:188
Common interface for all Harris Score kernels.
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205