/** Compute the Harris response of four pixels and zero the weak ones.
 *
 * Per lane the response is (gx2 * gy2 - gxgy^2) - sensitivity * (gx2 + gy2)^2.
 * Lanes whose response is not strictly greater than @p strength_thresh are
 * returned as 0.0f.
 *
 * @param[in] gx2             Accumulated Gx * Gx, one value per lane
 * @param[in] gy2             Accumulated Gy * Gy, one value per lane
 * @param[in] gxgy            Accumulated Gx * Gy, one value per lane
 * @param[in] sensitivity     Factor applied to the squared trace
 * @param[in] strength_thresh Minimum response a lane must exceed to be kept
 *
 * @return The thresholded response, one value per lane.
 */
inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
{
    // trace^2 = (gx2 + gy2)^2
    const float32x4_t trace    = vaddq_f32(gx2, gy2);
    const float32x4_t trace_sq = vmulq_f32(trace, trace);

    // determinant = gx2 * gy2 - gxgy * gxgy
    const float32x4_t determinant = vmlsq_f32(vmulq_f32(gx2, gy2), gxgy, gxgy);

    // response = determinant - sensitivity * trace^2
    const float32x4_t response = vmlsq_f32(determinant, sensitivity, trace_sq);

    // All-ones in the lanes where response > strength_thresh
    const uint32x4_t keep = vcgtq_f32(response, strength_thresh);

    // Select the response in the kept lanes and zero in the others
    return vbslq_f32(keep, response, vdupq_n_f32(0.0f));
}
/** Accumulate the gradient products of one row of a 3-wide window.
 *
 * The four gradient vectors are first multiplied by @p norm_factor. Treating
 * low:high as eight consecutive values, the three horizontal taps are the
 * 4-lane slices starting at offsets 0, 1 and 2; for each tap Gx*Gx, Gy*Gy and
 * Gx*Gy are added to the running sums.
 *
 * @param[in]      low_gx      First four Gx values of the row
 * @param[in]      low_gy      First four Gy values of the row
 * @param[in]      high_gx     Next four Gx values of the row
 * @param[in]      high_gy     Next four Gy values of the row
 * @param[in, out] gx2         Running sum of Gx * Gx
 * @param[in, out] gy2         Running sum of Gy * Gy
 * @param[in, out] gxgy        Running sum of Gx * Gy
 * @param[in]      norm_factor Scale applied to every gradient before use
 */
inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
                                              float32x4_t norm_factor)
{
    // Scaled gradients
    const float32x4_t gx_lo = vmulq_f32(low_gx, norm_factor);
    const float32x4_t gy_lo = vmulq_f32(low_gy, norm_factor);
    const float32x4_t gx_hi = vmulq_f32(high_gx, norm_factor);
    const float32x4_t gy_hi = vmulq_f32(high_gy, norm_factor);

    // Taps at offsets 1 and 2 (the tap at offset 0 is gx_lo / gy_lo itself)
    const float32x4_t gx_tap1 = vextq_f32(gx_lo, gx_hi, 1);
    const float32x4_t gy_tap1 = vextq_f32(gy_lo, gy_hi, 1);
    const float32x4_t gx_tap2 = vextq_f32(gx_lo, gx_hi, 2);
    const float32x4_t gy_tap2 = vextq_f32(gy_lo, gy_hi, 2);

    // Each sum receives the taps in the order 0, 1, 2
    gx2  = vmlaq_f32(vmlaq_f32(vmlaq_f32(gx2, gx_lo, gx_lo), gx_tap1, gx_tap1), gx_tap2, gx_tap2);
    gy2  = vmlaq_f32(vmlaq_f32(vmlaq_f32(gy2, gy_lo, gy_lo), gy_tap1, gy_tap1), gy_tap2, gy_tap2);
    gxgy = vmlaq_f32(vmlaq_f32(vmlaq_f32(gxgy, gx_lo, gy_lo), gx_tap1, gy_tap1), gx_tap2, gy_tap2);
}
/** Accumulate the gradient products of one row of a 5-wide window.
 *
 * The four gradient vectors are first multiplied by @p norm_factor. Treating
 * low:high as eight consecutive values, the five horizontal taps are the
 * 4-lane slices starting at offsets 0, 1, 2, 3 and 4; for each tap Gx*Gx,
 * Gy*Gy and Gx*Gy are added to the running sums.
 *
 * @param[in]      low_gx      First four Gx values of the row
 * @param[in]      low_gy      First four Gy values of the row
 * @param[in]      high_gx     Next four Gx values of the row
 * @param[in]      high_gy     Next four Gy values of the row
 * @param[in, out] gx2         Running sum of Gx * Gx
 * @param[in, out] gy2         Running sum of Gy * Gy
 * @param[in, out] gxgy        Running sum of Gx * Gy
 * @param[in]      norm_factor Scale applied to every gradient before use
 */
inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
                                              float32x4_t norm_factor)
{
    // Normalize
    low_gx  = vmulq_f32(low_gx, norm_factor);
    low_gy  = vmulq_f32(low_gy, norm_factor);
    high_gx = vmulq_f32(high_gx, norm_factor);
    high_gy = vmulq_f32(high_gy, norm_factor);

    // Add the products of one tap to the three running sums
    const auto accumulate = [&](float32x4_t gx, float32x4_t gy)
    {
        gx2  = vmlaq_f32(gx2, gx, gx);
        gy2  = vmlaq_f32(gy2, gy, gy);
        gxgy = vmlaq_f32(gxgy, gx, gy);
    };

    // Tap at offset 0
    accumulate(low_gx, low_gy);

    // Tap at offset 1
    accumulate(vextq_f32(low_gx, high_gx, 1), vextq_f32(low_gy, high_gy, 1));

    // Tap at offset 2
    accumulate(vextq_f32(low_gx, high_gx, 2), vextq_f32(low_gy, high_gy, 2));

    // Tap at offset 3
    accumulate(vextq_f32(low_gx, high_gx, 3), vextq_f32(low_gy, high_gy, 3));

    // Tap at offset 4 is exactly the high vector; vextq_f32 only accepts
    // lane indices 0..3, so it is used directly
    accumulate(high_gx, high_gy);
}
/** Accumulate the gradient products of one row of a 7-wide window.
 *
 * All six gradient vectors are multiplied by @p norm_factor. Treating
 * low:high:high1 as twelve consecutive values, the seven horizontal taps are
 * the 4-lane slices starting at offsets 0 to 6; for each tap Gx*Gx, Gy*Gy and
 * Gx*Gy are added to the running sums.
 *
 * @param[in]      low_gx      First four Gx values of the row
 * @param[in]      low_gy      First four Gy values of the row
 * @param[in]      high_gx     Gx values 4..7 of the row
 * @param[in]      high_gy     Gy values 4..7 of the row
 * @param[in]      high_gx1    Gx values 8..11 of the row
 * @param[in]      high_gy1    Gy values 8..11 of the row
 * @param[in, out] gx2         Running sum of Gx * Gx
 * @param[in, out] gy2         Running sum of Gy * Gy
 * @param[in, out] gxgy        Running sum of Gx * Gy
 * @param[in]      norm_factor Scale applied to every gradient before use
 */
inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
                                              float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
{
    // Normalize every input exactly once
    low_gx   = vmulq_f32(low_gx, norm_factor);
    low_gy   = vmulq_f32(low_gy, norm_factor);
    high_gx  = vmulq_f32(high_gx, norm_factor);
    high_gy  = vmulq_f32(high_gy, norm_factor);
    high_gx1 = vmulq_f32(high_gx1, norm_factor);
    high_gy1 = vmulq_f32(high_gy1, norm_factor);

    // Add the products of one tap to the three running sums
    const auto accumulate = [&](float32x4_t gx, float32x4_t gy)
    {
        gx2  = vmlaq_f32(gx2, gx, gx);
        gy2  = vmlaq_f32(gy2, gy, gy);
        gxgy = vmlaq_f32(gxgy, gx, gy);
    };

    // Taps at offsets 0..3 are slices of low:high
    accumulate(low_gx, low_gy);
    accumulate(vextq_f32(low_gx, high_gx, 1), vextq_f32(low_gy, high_gy, 1));
    accumulate(vextq_f32(low_gx, high_gx, 2), vextq_f32(low_gy, high_gy, 2));
    accumulate(vextq_f32(low_gx, high_gx, 3), vextq_f32(low_gy, high_gy, 3));

    // Tap at offset 4 is exactly the high vector
    accumulate(high_gx, high_gy);

    // Taps at offsets 5 and 6 are slices of high:high1
    accumulate(vextq_f32(high_gx, high_gx1, 1), vextq_f32(high_gy, high_gy1, 1));
    accumulate(vextq_f32(high_gx, high_gx1, 2), vextq_f32(high_gy, high_gy1, 2));
}
/** Harris response of eight horizontally adjacent pixels, 3x3 window, S16 gradients.
 *
 * For each of three rows (one stride before, at, and one stride after the
 * input pointers) the gradients are read starting one element before the
 * input pointers, widened to F32 and accumulated with the 1x3 helper. The
 * two groups of four responses are then thresholded by harris_score() and
 * eight floats are written to the output.
 *
 * @param[in]  input1_ptr         Pointer into the Gx plane (S16 elements)
 * @param[in]  input2_ptr         Pointer into the Gy plane (S16 elements)
 * @param[out] output_ptr         Destination for eight F32 responses
 * @param[in]  input_stride       Distance between rows, in elements (it is added to typed pointers)
 * @param[in]  in_norm_factor     Scale applied to every gradient
 * @param[in]  in_sensitivity     Sensitivity passed to harris_score()
 * @param[in]  in_strength_thresh Threshold passed to harris_score()
 */
inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    const auto     gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
    const auto     gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
    const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
    const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
    const auto     output   = static_cast<float *__restrict>(output_ptr);

    // Running sums of Gx^2, Gy^2 and Gx*Gy: val[0] serves outputs 0..3, val[1] outputs 4..7
    float32x4x2_t gx2  = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };
    float32x4x2_t gy2  = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };
    float32x4x2_t gxgy = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Rows -1, 0 and +1 relative to the input pointers, in that order
    for(int32_t row = -1; row <= 1; ++row)
    {
        const int32_t row_offset = row * input_stride;

        // The two loads of each plane overlap by four elements: val[1] starts where
        // the high half of val[0] starts
        const int16x8x2_t tmp_gx = { { vld1q_s16(gx_ptr_0 + row_offset), vld1q_s16(gx_ptr_1 + row_offset) } };
        const int16x8x2_t tmp_gy = { { vld1q_s16(gy_ptr_0 + row_offset), vld1q_s16(gy_ptr_1 + row_offset) } };

        for(int half = 0; half < 2; ++half)
        {
            // Widen S16 -> S32 -> F32
            const float32x4_t low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[half])));
            const float32x4_t low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[half])));
            const float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[half])));
            const float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[half])));

            harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[half], gy2.val[half], gxgy.val[half], norm_factor);
        }
    }

    // Thresholded response for both groups of four pixels
    const float32x4x2_t mc =
    {
        {
            harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
            harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
        }
    };

    // Store the eight responses
    vst1q_f32(output + 0, mc.val[0]);
    vst1q_f32(output + 4, mc.val[1]);
}
/** Harris response of eight horizontally adjacent pixels, 3x3 window, S32 gradients.
 *
 * For each of three rows (one stride before, at, and one stride after the
 * input pointers) twelve gradients are read starting one element before the
 * input pointers, converted to F32 and accumulated with the 1x3 helper. The
 * two groups of four responses are then thresholded by harris_score() and
 * eight floats are written to the output.
 *
 * @param[in]  input1_ptr         Pointer into the Gx plane (S32 elements)
 * @param[in]  input2_ptr         Pointer into the Gy plane (S32 elements)
 * @param[out] output_ptr         Destination for eight F32 responses
 * @param[in]  input_stride       Distance between rows, in elements (it is added to typed pointers)
 * @param[in]  in_norm_factor     Scale applied to every gradient
 * @param[in]  in_sensitivity     Sensitivity passed to harris_score()
 * @param[in]  in_strength_thresh Threshold passed to harris_score()
 */
inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    const auto     gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
    const auto     gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
    const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
    const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
    const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
    const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
    const auto     output   = static_cast<float *__restrict>(output_ptr);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    // Running sums of Gx^2, Gy^2 and Gx*Gy: val[0] serves outputs 0..3, val[1] outputs 4..7
    float32x4x2_t gx2  = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };
    float32x4x2_t gy2  = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };
    float32x4x2_t gxgy = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };

    // Rows -1, 0 and +1 relative to the input pointers, in that order
    for(int32_t row = -1; row <= 1; ++row)
    {
        const int32_t row_offset = row * input_stride;

        // Twelve consecutive gradients per plane, as three F32 vectors
        const float32x4_t gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + row_offset));
        const float32x4_t gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + row_offset));
        const float32x4_t gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + row_offset));
        const float32x4_t gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + row_offset));
        const float32x4_t gx_2 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + row_offset));
        const float32x4_t gy_2 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + row_offset));

        // The middle vector is the high half for outputs 0..3 and the low half for outputs 4..7
        harris_score1x3_FLOAT_FLOAT_FLOAT(gx_0, gy_0, gx_1, gy_1, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
        harris_score1x3_FLOAT_FLOAT_FLOAT(gx_1, gy_1, gx_2, gy_2, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
    }

    // Thresholded response for both groups of four pixels
    const float32x4x2_t mc =
    {
        {
            harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
            harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
        }
    };

    // Store the eight responses
    vst1q_f32(output + 0, mc.val[0]);
    vst1q_f32(output + 4, mc.val[1]);
}
/** Harris response of eight horizontally adjacent pixels, 5x5 window, S16 gradients.
 *
 * Reading starts two elements before and two rows above the input pointers.
 * Five consecutive rows are widened to F32 and accumulated with the 1x5
 * helper; the two groups of four responses are then thresholded by
 * harris_score() and eight floats are written to the output.
 *
 * @param[in]  input1_ptr         Pointer into the Gx plane (S16 elements)
 * @param[in]  input2_ptr         Pointer into the Gy plane (S16 elements)
 * @param[out] output_ptr         Destination for eight F32 responses
 * @param[in]  input_stride       Distance between rows, in elements (it is added to typed pointers)
 * @param[in]  in_norm_factor     Scale applied to every gradient
 * @param[in]  in_sensitivity     Sensitivity passed to harris_score()
 * @param[in]  in_strength_thresh Threshold passed to harris_score()
 */
inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    auto           gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
    auto           gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
    const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
    const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
    const auto     output   = static_cast<float *__restrict>(output_ptr);

    // Running sums of Gx^2, Gy^2 and Gx*Gy: val[0] serves outputs 0..3, val[1] outputs 4..7
    float32x4x2_t gx2  = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };
    float32x4x2_t gy2  = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };
    float32x4x2_t gxgy = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    for(int i = 0; i < 5; ++i)
    {
        // The two loads of each plane overlap by four elements: val[1] starts where
        // the high half of val[0] starts
        const int16x8x2_t tmp_gx = { { vld1q_s16(gx_ptr_0), vld1q_s16(gx_ptr_1) } };
        const int16x8x2_t tmp_gy = { { vld1q_s16(gy_ptr_0), vld1q_s16(gy_ptr_1) } };

        for(int half = 0; half < 2; ++half)
        {
            // Widen S16 -> S32 -> F32
            const float32x4_t low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[half])));
            const float32x4_t low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[half])));
            const float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[half])));
            const float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[half])));

            harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[half], gy2.val[half], gxgy.val[half], norm_factor);
        }

        // Move to the next row
        gx_ptr_0 += input_stride;
        gy_ptr_0 += input_stride;
        gx_ptr_1 += input_stride;
        gy_ptr_1 += input_stride;
    }

    // Thresholded response for both groups of four pixels
    const float32x4x2_t mc =
    {
        {
            harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
            harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
        }
    };

    // Store the eight responses
    vst1q_f32(output + 0, mc.val[0]);
    vst1q_f32(output + 4, mc.val[1]);
}
/** Harris response of eight horizontally adjacent pixels, 5x5 window, S32 gradients.
 *
 * Reading starts two elements before and two rows above the input pointers.
 * Five consecutive rows of twelve gradients are converted to F32 and
 * accumulated with the 1x5 helper; the two groups of four responses are then
 * thresholded by harris_score() and eight floats are written to the output.
 *
 * @param[in]  input1_ptr         Pointer into the Gx plane (S32 elements)
 * @param[in]  input2_ptr         Pointer into the Gy plane (S32 elements)
 * @param[out] output_ptr         Destination for eight F32 responses
 * @param[in]  input_stride       Distance between rows, in elements (it is added to typed pointers)
 * @param[in]  in_norm_factor     Scale applied to every gradient
 * @param[in]  in_sensitivity     Sensitivity passed to harris_score()
 * @param[in]  in_strength_thresh Threshold passed to harris_score()
 */
inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
    auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
    const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
    const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
    const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
    const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
    const auto     output   = static_cast<float *__restrict>(output_ptr);

    // Running sums of Gx^2, Gy^2 and Gx*Gy: val[0] serves outputs 0..3, val[1] outputs 4..7
    float32x4x2_t gx2  = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };
    float32x4x2_t gy2  = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };
    float32x4x2_t gxgy = { { vdupq_n_f32(0.0f), vdupq_n_f32(0.0f) } };

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    for(int i = 0; i < 5; ++i)
    {
        // Twelve consecutive gradients per plane, as three F32 vectors
        const float32x4_t gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
        const float32x4_t gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
        const float32x4_t gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
        const float32x4_t gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
        const float32x4_t gx_2 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
        const float32x4_t gy_2 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));

        // The middle vector is the high half for outputs 0..3 and the low half for outputs 4..7
        harris_score1x5_FLOAT_FLOAT_FLOAT(gx_0, gy_0, gx_1, gy_1, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
        harris_score1x5_FLOAT_FLOAT_FLOAT(gx_1, gy_1, gx_2, gy_2, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);

        // Move to the next row
        gx_ptr_0 += input_stride;
        gy_ptr_0 += input_stride;
        gx_ptr_1 += input_stride;
        gy_ptr_1 += input_stride;
        gx_ptr_2 += input_stride;
        gy_ptr_2 += input_stride;
    }

    // Thresholded response for both groups of four pixels
    const float32x4x2_t mc =
    {
        {
            harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
            harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
        }
    };

    // Store the eight responses
    vst1q_f32(output + 0, mc.val[0]);
    vst1q_f32(output + 4, mc.val[1]);
}
/** Harris response of four horizontally adjacent pixels, 7x7 window, S16 gradients.
 *
 * Reading starts three elements before and three rows above the input
 * pointers. Seven consecutive rows of twelve gradients (an 8-lane load plus a
 * 4-lane load) are widened to F32 and accumulated with the 1x7 helper; the
 * response is then thresholded by harris_score() and four floats are written
 * to the output.
 *
 * @param[in]  input1_ptr         Pointer into the Gx plane (S16 elements)
 * @param[in]  input2_ptr         Pointer into the Gy plane (S16 elements)
 * @param[out] output_ptr         Destination for four F32 responses
 * @param[in]  input_stride       Distance between rows, in elements (it is added to typed pointers)
 * @param[in]  in_norm_factor     Scale applied to every gradient
 * @param[in]  in_sensitivity     Sensitivity passed to harris_score()
 * @param[in]  in_strength_thresh Threshold passed to harris_score()
 */
inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // First element of the first row that is read
    auto       gx_row = static_cast<const int16_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
    auto       gy_row = static_cast<const int16_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
    const auto output = static_cast<float *__restrict>(output_ptr);

    // Running sums of Gx^2, Gy^2 and Gx*Gy
    float32x4_t sum_gx2  = vdupq_n_f32(0.0f);
    float32x4_t sum_gy2  = vdupq_n_f32(0.0f);
    float32x4_t sum_gxgy = vdupq_n_f32(0.0f);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    for(int rows_left = 7; rows_left > 0; --rows_left)
    {
        // Elements 0..7 and 8..11 of the current row
        const int16x8_t gx_first8 = vld1q_s16(gx_row);
        const int16x8_t gy_first8 = vld1q_s16(gy_row);
        const int16x4_t gx_next4  = vld1_s16(gx_row + 8);
        const int16x4_t gy_next4  = vld1_s16(gy_row + 8);

        // Widen S16 -> S32 -> F32 and accumulate the seven taps of this row
        harris_score1x7_FLOAT_FLOAT_FLOAT(vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx_first8))),
                                          vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy_first8))),
                                          vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx_first8))),
                                          vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy_first8))),
                                          vcvtq_f32_s32(vmovl_s16(gx_next4)),
                                          vcvtq_f32_s32(vmovl_s16(gy_next4)),
                                          sum_gx2, sum_gy2, sum_gxgy, norm_factor);

        // Move to the next row
        gx_row += input_stride;
        gy_row += input_stride;
    }

    // Threshold and store the four responses
    vst1q_f32(output, harris_score(sum_gx2, sum_gy2, sum_gxgy, sensitivity, strength_thresh));
}
/** Harris response of four horizontally adjacent pixels, 7x7 window, S32 gradients.
 *
 * Reading starts three elements before and three rows above the input
 * pointers. Seven consecutive rows of twelve gradients (three 4-lane loads)
 * are converted to F32 and accumulated with the 1x7 helper; the response is
 * then thresholded by harris_score() and four floats are written to the
 * output.
 *
 * @param[in]  input1_ptr         Pointer into the Gx plane (S32 elements)
 * @param[in]  input2_ptr         Pointer into the Gy plane (S32 elements)
 * @param[out] output_ptr         Destination for four F32 responses
 * @param[in]  input_stride       Distance between rows, in elements (it is added to typed pointers)
 * @param[in]  in_norm_factor     Scale applied to every gradient
 * @param[in]  in_sensitivity     Sensitivity passed to harris_score()
 * @param[in]  in_strength_thresh Threshold passed to harris_score()
 */
inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
{
    // First element of the first row that is read
    auto       gx_row = static_cast<const int32_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
    auto       gy_row = static_cast<const int32_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
    const auto output = static_cast<float *__restrict>(output_ptr);

    // Running sums of Gx^2, Gy^2 and Gx*Gy
    float32x4_t sum_gx2  = vdupq_n_f32(0.0f);
    float32x4_t sum_gy2  = vdupq_n_f32(0.0f);
    float32x4_t sum_gxgy = vdupq_n_f32(0.0f);

    const float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
    const float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
    const float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);

    for(int rows_left = 7; rows_left > 0; --rows_left)
    {
        // Elements 0..3, 4..7 and 8..11 of the current row, converted to F32
        const float32x4_t gx_0 = vcvtq_f32_s32(vld1q_s32(gx_row));
        const float32x4_t gy_0 = vcvtq_f32_s32(vld1q_s32(gy_row));
        const float32x4_t gx_1 = vcvtq_f32_s32(vld1q_s32(gx_row + 4));
        const float32x4_t gy_1 = vcvtq_f32_s32(vld1q_s32(gy_row + 4));
        const float32x4_t gx_2 = vcvtq_f32_s32(vld1q_s32(gx_row + 8));
        const float32x4_t gy_2 = vcvtq_f32_s32(vld1q_s32(gy_row + 8));

        // Accumulate the seven taps of this row
        harris_score1x7_FLOAT_FLOAT_FLOAT(gx_0, gy_0, gx_1, gy_1, gx_2, gy_2, sum_gx2, sum_gy2, sum_gxgy, norm_factor);

        // Move to the next row
        gx_row += input_stride;
        gy_row += input_stride;
    }

    // Threshold and store the four responses
    vst1q_f32(output, harris_score(sum_gx2, sum_gy2, sum_gxgy, sensitivity, strength_thresh));
}
700 : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
704 template <
int32_t block_size>
710 template <
int32_t block_size>
726 (*_func)(input1.
ptr(), input2.
ptr(), output.
ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
728 input1, input2, output);
731 template <
int32_t block_size>
737 template <
int32_t block_size>
739 bool border_undefined)
753 _sensitivity = sensitivity;
754 _strength_thresh = strength_thresh;
755 _norm_factor = norm_factor;
763 _func = &harris_score3x3_S16_S16_FLOAT;
766 _func = &harris_score5x5_S16_S16_FLOAT;
769 _func = &harris_score7x7_S16_S16_FLOAT;
781 _func = &harris_score3x3_S32_S32_FLOAT;
784 _func = &harris_score5x5_S32_S32_FLOAT;
787 _func = &harris_score7x7_S32_S32_FLOAT;
798 constexpr
unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12;
799 constexpr
unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4;
800 constexpr
unsigned int num_rows_read_per_iteration = block_size;
807 AccessWindowRectangle(input1->
info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
808 AccessWindowRectangle(input2->
info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
814 output_access.set_valid_region(win, valid_region, border_undefined,
border_size());
816 INEKernel::configure(win);
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t)
Container for 2D border size.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
size_t element_size_from_data_type(DataType dt)
The size in bytes of the data type.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
Template Neon kernel to perform Harris Score.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
const ValidRegion valid_region
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
1 channel, 1 S32 per channel
Implementation of a rectangular access pattern.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...)
NEHarrisScoreKernel()
Default constructor.
Class to describe a number of elements in each dimension.
INEHarrisScoreKernel()
Default constructor.
BorderSize border_size() const override
The size of the border for that kernel.
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
ValidRegion intersect_valid_regions(const Ts &... regions)
Intersect multiple valid regions.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
1 channel, 1 S16 per channel
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override
Setup the kernel parameters.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda function for each element.
Container for valid region of a window.
Common interface for all Harris Score kernels.
Iterator updated by execute_window_loop for each window element.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)