Compute Library
 21.02
NENonLinearFilterKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
27 #include "arm_compute/core/Error.h"
34 
35 #include <algorithm>
36 #include <arm_neon.h>
37 #include <array>
38 #include <tuple>
39 #include <utility>
40 
41 namespace arm_compute
42 {
43 namespace
44 {
// All-zero 16-lane vector. min_row()/max_row() pass it as the second operand of
// vextq_u8, so zeros are what gets shifted into the top lanes of a row.
45 const uint8x16_t zero_u8 = vdupq_n_u8(0);
46 
47 template <size_t columns>
48 inline uint8x8_t min_row(uint8x16_t row_data)
49 {
50  uint8x8_t min = vget_low_u8(row_data);
51 
52  for(size_t c = 1; c < columns; ++c)
53  {
54  row_data = vextq_u8(row_data, zero_u8, 1);
55  min = vmin_u8(min, vget_low_u8(row_data));
56  }
57 
58  return min;
59 }
60 
61 template <size_t columns>
62 inline uint8x8_t max_row(uint8x16_t row_data)
63 {
64  uint8x8_t max = vget_low_u8(row_data);
65 
66  for(size_t c = 1; c < columns; ++c)
67  {
68  row_data = vextq_u8(row_data, zero_u8, 1);
69  max = vmax_u8(max, vget_low_u8(row_data));
70  }
71 
72  return max;
73 }
74 
75 inline void sort(uint8x8_t &a, uint8x8_t &b)
76 {
77  const uint8x8_t min = vmin_u8(a, b);
78  const uint8x8_t max = vmax_u8(a, b);
79  a = min;
80  b = max;
81 }
82 
83 // Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
84 // Calculations that do not affect the median were removed.
85 inline void sort5(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2, uint8x8_t &p3, uint8x8_t &p4)
86 {
87  sort(p0, p1);
88  sort(p2, p3);
89  sort(p0, p2);
90  sort(p1, p3);
91  sort(p1, p2);
92  sort(p0, p4);
93  sort(p1, p4);
94  sort(p2, p4);
95 }
96 
97 inline void sort9(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2,
98  uint8x8_t &p3, uint8x8_t &p4, uint8x8_t &p5,
99  uint8x8_t &p6, uint8x8_t &p7, uint8x8_t &p8)
100 {
101  sort(p1, p2);
102  sort(p4, p5);
103  sort(p7, p8);
104  sort(p0, p1);
105  sort(p3, p4);
106  sort(p6, p7);
107  sort(p1, p2);
108  sort(p4, p5);
109  sort(p7, p8);
110  sort(p0, p3);
111  sort(p5, p8);
112  sort(p4, p7);
113  sort(p3, p6);
114  sort(p1, p4);
115  sort(p2, p5);
116  sort(p4, p7);
117  sort(p4, p2);
118  sort(p6, p4);
119  sort(p4, p2);
120 }
121 
// Reduced sorting network over 21 vectors, applied lane-wise through sort().
// Each sort(p[i], p[j]) leaves the lane-wise minima in p[i] and the maxima in
// p[j]. Per the note above sort5, exchanges that do not affect the median were
// removed, so the array is NOT fully sorted on return; the disk-shaped 5x5
// median filter in this file reads p[10] afterwards.
// The order of the exchanges is significant - do not reorder.
122 inline void sort21(std::array<uint8x8_t, 21> &p)
123 {
124  sort(p[0], p[1]);
125  sort(p[2], p[3]);
126  sort(p[4], p[5]);
127  sort(p[6], p[7]);
128  sort(p[8], p[9]);
129  sort(p[10], p[11]);
130  sort(p[12], p[13]);
131  sort(p[14], p[15]);
132  sort(p[16], p[17]);
133  sort(p[18], p[19]);
134  sort(p[0], p[2]);
135  sort(p[1], p[3]);
136  sort(p[4], p[6]);
137  sort(p[5], p[7]);
138  sort(p[8], p[10]);
139  sort(p[9], p[11]);
140  sort(p[12], p[14]);
141  sort(p[13], p[15]);
142  sort(p[16], p[18]);
143  sort(p[17], p[19]);
144  sort(p[1], p[2]);
145  sort(p[5], p[6]);
146  sort(p[0], p[4]);
147  sort(p[3], p[7]);
148  sort(p[9], p[10]);
149  sort(p[13], p[14]);
150  sort(p[8], p[12]);
151  sort(p[11], p[15]);
152  sort(p[17], p[18]);
153  sort(p[16], p[20]);
154  sort(p[1], p[5]);
155  sort(p[2], p[6]);
156  sort(p[9], p[13]);
157  sort(p[10], p[14]);
158  sort(p[0], p[8]);
159  sort(p[7], p[15]);
160  sort(p[17], p[20]);
161  sort(p[1], p[4]);
162  sort(p[3], p[6]);
163  sort(p[9], p[12]);
164  sort(p[11], p[14]);
165  sort(p[18], p[20]);
166  sort(p[0], p[16]);
167  sort(p[2], p[4]);
168  sort(p[3], p[5]);
169  sort(p[10], p[12]);
170  sort(p[11], p[13]);
171  sort(p[1], p[9]);
172  sort(p[6], p[14]);
173  sort(p[19], p[20]);
174  sort(p[3], p[4]);
175  sort(p[11], p[12]);
176  sort(p[1], p[8]);
177  sort(p[2], p[10]);
178  sort(p[5], p[13]);
179  sort(p[7], p[14]);
180  sort(p[3], p[11]);
181  sort(p[2], p[8]);
182  sort(p[4], p[12]);
183  sort(p[7], p[13]);
184  sort(p[1], p[17]);
185  sort(p[3], p[10]);
186  sort(p[5], p[12]);
187  sort(p[1], p[16]);
188  sort(p[2], p[18]);
189  sort(p[3], p[9]);
190  sort(p[6], p[12]);
191  sort(p[2], p[16]);
192  sort(p[3], p[8]);
193  sort(p[7], p[12]);
194  sort(p[5], p[9]);
195  sort(p[6], p[10]);
196  sort(p[4], p[8]);
197  sort(p[7], p[11]);
198  sort(p[3], p[19]);
199  sort(p[5], p[8]);
200  sort(p[7], p[10]);
201  sort(p[3], p[18]);
202  sort(p[4], p[20]);
203  sort(p[6], p[8]);
204  sort(p[7], p[9]);
205  sort(p[3], p[17]);
206  sort(p[5], p[20]);
207  sort(p[7], p[8]);
208  sort(p[3], p[16]);
209  sort(p[6], p[20]);
210  sort(p[5], p[17]);
211  sort(p[7], p[20]);
212  sort(p[4], p[16]);
213  sort(p[6], p[18]);
214  sort(p[5], p[16]);
215  sort(p[7], p[19]);
216  sort(p[7], p[18]);
217  sort(p[6], p[16]);
218  sort(p[7], p[17]);
219  sort(p[10], p[18]);
220  sort(p[7], p[16]);
221  sort(p[9], p[17]);
222  sort(p[8], p[16]);
223  sort(p[9], p[16]);
224  sort(p[10], p[16]);
225 }
226 
// Reduced sorting network over 25 vectors, applied lane-wise through sort().
// Each sort(p[i], p[j]) leaves the lane-wise minima in p[i] and the maxima in
// p[j]. Per the note above sort5, exchanges that do not affect the median were
// removed, so the array is NOT fully sorted on return; the box-shaped 5x5
// median filter in this file reads p[12] afterwards.
// The order of the exchanges is significant - do not reorder.
227 inline void sort25(std::array<uint8x8_t, 25> &p)
228 {
229  sort(p[1], p[2]);
230  sort(p[0], p[1]);
231  sort(p[1], p[2]);
232  sort(p[4], p[5]);
233  sort(p[3], p[4]);
234  sort(p[4], p[5]);
235  sort(p[0], p[3]);
236  sort(p[2], p[5]);
237  sort(p[2], p[3]);
238  sort(p[1], p[4]);
239  sort(p[1], p[2]);
240  sort(p[3], p[4]);
241  sort(p[7], p[8]);
242  sort(p[6], p[7]);
243  sort(p[7], p[8]);
244  sort(p[10], p[11]);
245  sort(p[9], p[10]);
246  sort(p[10], p[11]);
247  sort(p[6], p[9]);
248  sort(p[8], p[11]);
249  sort(p[8], p[9]);
250  sort(p[7], p[10]);
251  sort(p[7], p[8]);
252  sort(p[9], p[10]);
253  sort(p[0], p[6]);
254  sort(p[4], p[10]);
255  sort(p[4], p[6]);
256  sort(p[2], p[8]);
257  sort(p[2], p[4]);
258  sort(p[6], p[8]);
259  sort(p[1], p[7]);
260  sort(p[5], p[11]);
261  sort(p[5], p[7]);
262  sort(p[3], p[9]);
263  sort(p[3], p[5]);
264  sort(p[7], p[9]);
265  sort(p[1], p[2]);
266  sort(p[3], p[4]);
267  sort(p[5], p[6]);
268  sort(p[7], p[8]);
269  sort(p[9], p[10]);
270  sort(p[13], p[14]);
271  sort(p[12], p[13]);
272  sort(p[13], p[14]);
273  sort(p[16], p[17]);
274  sort(p[15], p[16]);
275  sort(p[16], p[17]);
276  sort(p[12], p[15]);
277  sort(p[14], p[17]);
278  sort(p[14], p[15]);
279  sort(p[13], p[16]);
280  sort(p[13], p[14]);
281  sort(p[15], p[16]);
282  sort(p[19], p[20]);
283  sort(p[18], p[19]);
284  sort(p[19], p[20]);
285  sort(p[21], p[22]);
286  sort(p[23], p[24]);
287  sort(p[21], p[23]);
288  sort(p[22], p[24]);
289  sort(p[22], p[23]);
290  sort(p[18], p[21]);
291  sort(p[20], p[23]);
292  sort(p[20], p[21]);
293  sort(p[19], p[22]);
294  sort(p[22], p[24]);
295  sort(p[19], p[20]);
296  sort(p[21], p[22]);
297  sort(p[23], p[24]);
298  sort(p[12], p[18]);
299  sort(p[16], p[22]);
300  sort(p[16], p[18]);
301  sort(p[14], p[20]);
302  sort(p[20], p[24]);
303  sort(p[14], p[16]);
304  sort(p[18], p[20]);
305  sort(p[22], p[24]);
306  sort(p[13], p[19]);
307  sort(p[17], p[23]);
308  sort(p[17], p[19]);
309  sort(p[15], p[21]);
310  sort(p[15], p[17]);
311  sort(p[19], p[21]);
312  sort(p[13], p[14]);
313  sort(p[15], p[16]);
314  sort(p[17], p[18]);
315  sort(p[19], p[20]);
316  sort(p[21], p[22]);
317  sort(p[23], p[24]);
318  sort(p[0], p[12]);
319  sort(p[8], p[20]);
320  sort(p[8], p[12]);
321  sort(p[4], p[16]);
322  sort(p[16], p[24]);
323  sort(p[12], p[16]);
324  sort(p[2], p[14]);
325  sort(p[10], p[22]);
326  sort(p[10], p[14]);
327  sort(p[6], p[18]);
328  sort(p[6], p[10]);
329  sort(p[10], p[12]);
330  sort(p[1], p[13]);
331  sort(p[9], p[21]);
332  sort(p[9], p[13]);
333  sort(p[5], p[17]);
334  sort(p[13], p[17]);
335  sort(p[3], p[15]);
336  sort(p[11], p[23]);
337  sort(p[11], p[15]);
338  sort(p[7], p[19]);
339  sort(p[7], p[11]);
340  sort(p[11], p[13]);
341  sort(p[11], p[12]);
342 }
343 } // namespace
344 
346  : _border_width(0), _input(nullptr), _output(nullptr), _mask(nullptr), _pattern(MatrixPattern::BOX), _function(NonLinearFilterFunction::MIN), _func_idx(0), _border_size()
347 {
348 }
349 
351 {
352  return _border_size;
353 }
354 
355 void NENonLinearFilterKernel::configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
356  bool border_undefined)
357 {
360  ARM_COMPUTE_ERROR_ON(3 != mask_size && 5 != mask_size);
361  ARM_COMPUTE_ERROR_ON(MatrixPattern::OTHER == pattern && nullptr == mask);
362 
363  // Set class variables
364  _border_size = BorderSize(mask_size / 2);
365  _input = input;
366  _output = output;
367  _mask = mask;
368  _pattern = pattern;
369  _function = function;
370 
371  // Configure kernel window
372  const unsigned int num_elems_processed_per_iteration = (MatrixPattern::OTHER == pattern) ? 1 : 8;
373  constexpr unsigned int num_elems_read_per_iteration = 16;
374 
375  Window win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size());
378  AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, mask_size),
379  output_access);
380  output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
381 
382  INEKernel::configure(win);
383 
384  // Define function index
385  _func_idx = (3 == mask_size) ? 0 : 1;
386 
387  if(MatrixPattern::OTHER != pattern)
388  {
389  _func_idx = (_func_idx) * 3 + static_cast<unsigned int>(function);
390  }
391 }
392 
393 void NENonLinearFilterKernel::fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern)
394 {
395  unsigned int v = 0;
396 
397  for(int r = 0; r < rows; ++r)
398  {
399  for(int c = 0; c < cols; ++c, ++v)
400  {
401  uint8_t val = 0;
402 
403  switch(pattern)
404  {
405  case MatrixPattern::BOX:
406  val = 255;
407  break;
409  val = ((r == (rows / 2)) || (c == (cols / 2))) ? 255 : 0;
410  break;
411  case MatrixPattern::DISK:
412  val = (((r - rows / 2.0f + 0.5f) * (r - rows / 2.0f + 0.5f)) / ((rows / 2.0f) * (rows / 2.0f)) + ((c - cols / 2.0f + 0.5f) * (c - cols / 2.0f + 0.5f)) / ((cols / 2.0f) *
413  (cols / 2.0f))) <= 1.0f ? 255 : 0;
414  break;
415  default:
416  return;
417  }
418 
419  mask[v] = val;
420  }
421  }
422 }
423 
424 template <>
425 void NENonLinearFilterKernel::median_filter_box<3, 3>(const Window &win)
426 {
427  Iterator input(_input, win);
428  Iterator output(_output, win);
429 
430  const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -1)));
431  const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
432  const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 1)));
433 
434  execute_window_loop(win, [&](const Coordinates &)
435  {
436  const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
437  const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
438  const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
439 
440  uint8x8_t p0 = vget_low_u8(top_data);
441  uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1);
442  uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2);
443  uint8x8_t p3 = vget_low_u8(mid_data);
444  uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
445  uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
446  uint8x8_t p6 = vget_low_u8(bot_data);
447  uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1);
448  uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2);
449 
450  sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
451 
452  vst1_u8(output.ptr(), p4);
453  },
454  input, output);
455 }
456 template <>
457 void NENonLinearFilterKernel::median_filter_box<5, 5>(const Window &win)
458 {
459  Iterator input(_input, win);
460  Iterator output(_output, win);
461 
462  const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
463  const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
464  const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
465  const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
466  const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
467 
468  execute_window_loop(win, [&](const Coordinates &)
469  {
470  const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
471  const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
472  const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
473  const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
474  const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
475 
476  const std::array<uint8x8_t, 10> d =
477  {
478  vget_low_u8(top2_data),
479  vget_high_u8(top2_data),
480  vget_low_u8(top_data),
481  vget_high_u8(top_data),
482  vget_low_u8(mid_data),
483  vget_high_u8(mid_data),
484  vget_low_u8(bot_data),
485  vget_high_u8(bot_data),
486  vget_low_u8(bot2_data),
487  vget_high_u8(bot2_data)
488  };
489 
490  std::array<uint8x8_t, 25> p{ 0 };
491  for(unsigned int i = 0; i < 5; ++i)
492  {
493  const unsigned int idx_d = i * 2;
494  const unsigned int idx_p = i * 5;
495 
496  p[idx_p] = d[idx_d];
497  p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
498  p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
499  p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
500  p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
501  }
502 
503  sort25(p);
504 
505  vst1_u8(output.ptr(), p[12]);
506  },
507  input, output);
508 }
509 
510 template <int mask_w, int mask_h>
511 void NENonLinearFilterKernel::min_filter_box(const Window &win)
512 {
513  static_assert(mask_w > 0, "Mask size must not be 0");
514  static_assert(mask_h > 0, "Mask size must not be 0");
515 
516  Iterator input(_input, win);
517  Iterator output(_output, win);
518 
519  const int k_row_half = mask_h / 2;
520  const int k_col_half = mask_w / 2;
521 
522  // Set row pointers
523  std::array<const unsigned char *, mask_h> input_ptrs{ {} };
524  for(int i = -k_row_half; i <= k_row_half; ++i)
525  {
526  input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
527  }
528 
529  execute_window_loop(win, [&](const Coordinates &)
530  {
531  // Get min of rows
532  uint8x16_t rows_min = vld1q_u8(input_ptrs[0] + input.offset());
533 
534  for(unsigned int r = 1; r < mask_h; ++r)
535  {
536  const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
537  rows_min = vminq_u8(rows_min, data);
538  }
539 
540  const uint8x8_t out = min_row<mask_w>(rows_min);
541 
542  // Store result as U8
543  vst1_u8(output.ptr(), out);
544  },
545  input, output);
546 }
547 
548 template <int mask_w, int mask_h>
549 void NENonLinearFilterKernel::max_filter_box(const Window &win)
550 {
551  static_assert(mask_w > 0, "Mask size must not be 0");
552  static_assert(mask_h > 0, "Mask size must not be 0");
553  ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
554 
555  Iterator input(_input, win);
556  Iterator output(_output, win);
557 
558  const int k_row_half = mask_h / 2;
559  const int k_col_half = mask_w / 2;
560 
561  // Set row pointers
562  std::array<const unsigned char *, mask_h> input_ptrs{ {} };
563  for(int i = -k_row_half; i <= k_row_half; ++i)
564  {
565  input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
566  }
567 
568  execute_window_loop(win, [&](const Coordinates &)
569  {
570  uint8x16_t rows_max = vld1q_u8(input_ptrs[0] + input.offset());
571 
572  // Get max of rows
573  for(unsigned int r = 1; r < mask_h; ++r)
574  {
575  const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
576  rows_max = vmaxq_u8(rows_max, data);
577  }
578 
579  // Get max of columns
580  const uint8x8_t out = max_row<mask_w>(rows_max);
581 
582  // Store result as U8
583  vst1_u8(output.ptr(), out);
584  },
585  input, output);
586 }
587 
588 template <>
589 void NENonLinearFilterKernel::median_filter_cross<3, 3>(const Window &win)
590 {
591  Iterator input(_input, win);
592  Iterator output(_output, win);
593 
594  const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
595  const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
596  const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
597 
598  execute_window_loop(win, [&](const Coordinates &)
599  {
600  const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
601  const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
602  const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset());
603 
604  uint8x8_t p0 = top_data;
605  uint8x8_t p1 = vget_low_u8(mid_data);
606  uint8x8_t p2 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
607  uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
608  uint8x8_t p4 = bot_data;
609 
610  sort5(p0, p1, p2, p3, p4);
611 
612  vst1_u8(output.ptr(), p2);
613  },
614  input, output);
615 }
616 
617 template <>
618 void NENonLinearFilterKernel::median_filter_cross<5, 5>(const Window &win)
619 {
620  Iterator input(_input, win);
621  Iterator output(_output, win);
622 
623  const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -2)));
624  const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
625  const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
626  const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
627  const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 2)));
628 
629  execute_window_loop(win, [&](const Coordinates &)
630  {
631  const uint8x8_t top2_data = vld1_u8(input_top2_ptr + input.offset());
632  const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset());
633  const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
634  const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset());
635  const uint8x8_t bot2_data = vld1_u8(input_bot2_ptr + input.offset());
636 
637  uint8x8_t p0 = top2_data;
638  uint8x8_t p1 = top_data;
639  uint8x8_t p2 = vget_low_u8(mid_data);
640  uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
641  uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
642  uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 3);
643  uint8x8_t p6 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 4);
644  uint8x8_t p7 = bot_data;
645  uint8x8_t p8 = bot2_data;
646 
647  sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
648 
649  vst1_u8(output.ptr(), p4);
650  },
651  input, output);
652 }
653 
654 template <int mask_w, int mask_h>
655 void NENonLinearFilterKernel::min_filter_cross(const Window &win)
656 {
657  static_assert(mask_w > 0, "Mask size must not be 0");
658  static_assert(mask_h > 0, "Mask size must not be 0");
659  ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
660 
661  Iterator input(_input, win);
662  Iterator output(_output, win);
663 
664  const int k_row_half = mask_h / 2;
665  const int k_col_half = mask_w / 2;
666 
667  const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
668 
669  // Set row pointers
670  std::array<const unsigned char *, mask_h> input_ptrs{ {} };
671  for(int i = -k_row_half; i <= k_row_half; ++i)
672  {
673  input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
674  }
675 
676  execute_window_loop(win, [&](const Coordinates &)
677  {
678  uint8x8_t rows_min = vld1_u8(input_ptrs[0] + input.offset());
679 
680  // Get min of rows
681  for(unsigned int r = 1; r < mask_h; ++r)
682  {
683  const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
684  rows_min = vmin_u8(rows_min, data);
685  }
686 
687  // Get min of middle row
688  const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
689  uint8x8_t out = min_row<mask_w>(data);
690 
691  // Get final min
692  out = vmin_u8(out, rows_min);
693 
694  // Store result as U8
695  vst1_u8(output.ptr(), out);
696  },
697  input, output);
698 }
699 
700 template <int mask_w, int mask_h>
701 void NENonLinearFilterKernel::max_filter_cross(const Window &win)
702 {
703  static_assert(mask_w > 0, "Mask size must not be 0");
704  static_assert(mask_h > 0, "Mask size must not be 0");
705  ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
706 
707  Iterator input(_input, win);
708  Iterator output(_output, win);
709 
710  const int k_row_half = mask_h / 2;
711  const int k_col_half = mask_w / 2;
712 
713  const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
714 
715  // Set row pointers
716  std::array<unsigned char *, mask_h> input_ptrs{ {} };
717  for(int i = -k_row_half; i <= k_row_half; ++i)
718  {
719  input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
720  }
721 
722  execute_window_loop(win, [&](const Coordinates &)
723  {
724  uint8x8_t rows_max = vld1_u8(input_ptrs[0] + input.offset());
725 
726  // Get max of rows
727  for(unsigned int r = 1; r < mask_h; ++r)
728  {
729  const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
730  rows_max = vmax_u8(rows_max, data);
731  }
732 
733  // Get max of middle row
734  const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
735  uint8x8_t out = max_row<mask_w>(data);
736 
737  // Get final max
738  out = vmax_u8(out, rows_max);
739 
740  // Store result as U8
741  vst1_u8(output.ptr(), out);
742  },
743  input, output);
744 }
745 
746 template <>
747 void NENonLinearFilterKernel::median_filter_disk<5, 5>(const Window &win)
748 {
749  Iterator input(_input, win);
750  Iterator output(_output, win);
751 
752  static const uint8x16_t zero = vdupq_n_u8(0);
753  const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
754  const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
755  const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
756  const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
757  const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
758 
759  execute_window_loop(win, [&](const Coordinates &)
760  {
761  const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
762  const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
763  const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
764  const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
765  const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
766 
767  std::array<uint8x8_t, 10> d =
768  {
769  vget_low_u8(top2_data),
770  vget_high_u8(top2_data),
771  vget_low_u8(top_data),
772  vget_high_u8(top_data),
773  vget_low_u8(mid_data),
774  vget_high_u8(mid_data),
775  vget_low_u8(bot_data),
776  vget_high_u8(bot_data),
777  vget_low_u8(bot2_data),
778  vget_high_u8(bot2_data)
779  };
780 
781  std::array<uint8x8_t, 21> p{ 0 };
782  p[0] = d[0];
783  p[1] = vext_u8(d[0], d[1], 1);
784  p[2] = vext_u8(d[0], d[1], 2);
785  p[18] = d[8];
786  p[19] = vext_u8(d[8], d[9], 1);
787  p[20] = vext_u8(d[8], d[9], 2);
788 
789  for(unsigned int i = 0; i < 3; ++i)
790  {
791  const unsigned int idx_d = 2 + i * 2;
792  const unsigned int idx_p = 3 + i * 5;
793 
794  p[idx_p] = d[idx_d];
795  p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
796  p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
797  p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
798  p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
799  }
800 
801  sort21(p);
802 
803  vst1_u8(output.ptr(), p[10]);
804  },
805  input, output);
806 }
807 
808 template <>
809 void NENonLinearFilterKernel::min_filter_disk<5, 5>(const Window &win)
810 {
811  Iterator input(_input, win);
812  Iterator output(_output, win);
813 
814  static const uint8x16_t zero = vdupq_n_u8(0);
815  const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
816  const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
817  const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
818  const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
819  const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
820 
821  execute_window_loop(win, [&](const Coordinates &)
822  {
823  const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
824  const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
825  const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
826  const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
827  const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
828 
829  const uint8x16_t rows_min_3 = vminq_u8(top2_data, bot2_data);
830  uint8x16_t rows_min_5 = vminq_u8(top_data, bot_data);
831  rows_min_5 = vminq_u8(rows_min_5, mid_data);
832 
833  const uint8x8_t out_3 = min_row<3>(rows_min_3);
834  const uint8x8_t out_5 = min_row<5>(rows_min_5);
835 
836  vst1_u8(output.ptr(), vmin_u8(out_3, out_5));
837  },
838  input, output);
839 }
840 
841 template <>
842 void NENonLinearFilterKernel::max_filter_disk<5, 5>(const Window &win)
843 {
844  Iterator input(_input, win);
845  Iterator output(_output, win);
846 
847  static const uint8x16_t zero = vdupq_n_u8(0);
848  const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
849  const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
850  const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
851  const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
852  const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
853 
854  execute_window_loop(win, [&](const Coordinates &)
855  {
856  const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
857  const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
858  const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
859  const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
860  const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
861 
862  const uint8x16_t rows_max_3 = vmaxq_u8(top2_data, bot2_data);
863  uint8x16_t rows_max_5 = vmaxq_u8(top_data, bot_data);
864  rows_max_5 = vmaxq_u8(rows_max_5, mid_data);
865 
866  const uint8x8_t out_3 = max_row<3>(rows_max_3);
867  const uint8x8_t out_5 = max_row<5>(rows_max_5);
868 
869  vst1_u8(output.ptr(), vmax_u8(out_3, out_5));
870  },
871  input, output);
872 }
873 
874 template <int mask_w, int mask_h>
875 void NENonLinearFilterKernel::non_linear_filter_generic(const Window &win)
876 {
877  Iterator input(_input, win);
878  Iterator output(_output, win);
879  ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
880 
881  const int k_row_half = mask_h / 2;
882  const int k_col_half = mask_w / 2;
883  constexpr int mask_size = mask_w * mask_h;
884 
885  // Set row pointers
886  std::array<unsigned char *, mask_h> input_ptrs{ {} };
887  for(int i = -k_row_half; i <= k_row_half; ++i)
888  {
889  input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
890  }
891 
892  std::array<uint8_t, mask_size> vals{ {} };
893 
894  execute_window_loop(win, [&](const Coordinates &)
895  {
896  // Clear array
897  std::fill(std::begin(vals), std::end(vals), 0);
898 
899  size_t v = 0;
900  size_t m = 0;
901 
902  for(unsigned int r = 0; r < mask_h; ++r)
903  {
904  const auto in_ptr = static_cast<const uint8_t *>(input_ptrs[r] + input.offset());
905 
906  for(unsigned int c = 0; c < mask_w; ++c, ++m)
907  {
908  if(_mask[m] == 255)
909  {
910  vals[v] = in_ptr[c];
911  ++v;
912  }
913  }
914  }
915 
916  // Only do something if there is at least one non-zero element in the
917  // mask
918  if(v > 0)
919  {
920  std::sort(vals.begin(), vals.begin() + v);
921 
922  switch(_function)
923  {
925  *output.ptr() = vals[0];
926  break;
928  *output.ptr() = vals[v - 1];
929  break;
931  *output.ptr() = vals[v / 2];
932  break;
933  default:
934  break;
935  }
936  }
937  },
938  input, output);
939 }
940 
942 {
943  ARM_COMPUTE_UNUSED(info);
946 
948 
949  // Function table for BOX pattern
950  static const std::array<NonLinearFilterFunction, 6> func_table_box =
951  {
952  {
953  &NENonLinearFilterKernel::median_filter_box<3, 3>,
954  &NENonLinearFilterKernel::min_filter_box<3, 3>,
955  &NENonLinearFilterKernel::max_filter_box<3, 3>,
956  &NENonLinearFilterKernel::median_filter_box<5, 5>,
957  &NENonLinearFilterKernel::min_filter_box<5, 5>,
958  &NENonLinearFilterKernel::max_filter_box<5, 5>,
959  }
960  };
961 
962  // Function table for CROSS pattern
963  static const std::array<NonLinearFilterFunction, 6> func_table_cross =
964  {
965  {
966  &NENonLinearFilterKernel::median_filter_cross<3, 3>,
967  &NENonLinearFilterKernel::min_filter_cross<3, 3>,
968  &NENonLinearFilterKernel::max_filter_cross<3, 3>,
969  &NENonLinearFilterKernel::median_filter_cross<5, 5>,
970  &NENonLinearFilterKernel::min_filter_cross<5, 5>,
971  &NENonLinearFilterKernel::max_filter_cross<5, 5>,
972  }
973  };
974 
975  // Function table for DISK pattern
976  static const std::array<NonLinearFilterFunction, 6> func_table_disk =
977  {
978  {
979  &NENonLinearFilterKernel::median_filter_box<3, 3>,
980  &NENonLinearFilterKernel::min_filter_box<3, 3>,
981  &NENonLinearFilterKernel::max_filter_box<3, 3>,
982  &NENonLinearFilterKernel::median_filter_disk<5, 5>,
983  &NENonLinearFilterKernel::min_filter_disk<5, 5>,
984  &NENonLinearFilterKernel::max_filter_disk<5, 5>,
985  }
986  };
987 
988  // Function table for OTHER pattern
989  static const std::array<NonLinearFilterFunction, 2> func_table_generic =
990  {
991  {
992  &NENonLinearFilterKernel::non_linear_filter_generic<3, 3>,
993  &NENonLinearFilterKernel::non_linear_filter_generic<5, 5>,
994  }
995  };
996 
997  switch(_pattern)
998  {
999  case MatrixPattern::BOX:
1000  ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_box.size());
1001  (this->*func_table_box[_func_idx])(window);
1002  break;
1003  case MatrixPattern::CROSS:
1004  ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_cross.size());
1005  (this->*func_table_cross[_func_idx])(window);
1006  break;
1007  case MatrixPattern::DISK:
1008  ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_disk.size());
1009  (this->*func_table_disk[_func_idx])(window);
1010  break;
1011  case MatrixPattern::OTHER:
1012  default:
1013  ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_generic.size());
1014  (this->*func_table_generic[_func_idx])(window);
1015  break;
1016  }
1017 }
1018 } // namespace arm_compute
unsigned int top
top of the border
Definition: Types.h:375
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
Definition: ITensor.h:63
BorderSize border_size() const override
The size of the border for that kernel.
virtual int32_t offset_element_in_bytes(const Coordinates &pos) const =0
The offset in bytes from the beginning of the memory allocation to access the element at position (x, y, z, ...)
SimpleTensor< float > b
Definition: DFT.cpp:157
Container for 2D border size.
Definition: Types.h:273
1 channel, 1 U8 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
#define MIN(x, y)
Interface for Neon tensor.
Definition: ITensor.h:36
uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
Sorting network to sort 5 vectors of 8 elements and return their median.
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Implementation of a rectangular access pattern.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: WindowHelpers.h:46
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
library fill(src, distribution, 0)
Coordinates of an item.
Definition: Coordinates.h:37
virtual uint8_t * buffer() const =0
Interface to be implemented by the child class to return a pointer to CPU memory.
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
void end(TokenStream &in, bool &valid)
Definition: MLGOParser.cpp:290
unsigned int left
left of the border
Definition: Types.h:378
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:790
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
void configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, bool border_undefined)
Set the source, destination and border mode of the kernel.
Information about executing thread and CPU.
Definition: CPPTypes.h:235
uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
Sorting network to sort 9 vectors of 8 elements and return their median.
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
Definition: Helpers.inl:77
constexpr size_t offset() const
Return the offset in bytes from the first element to the current position of the iterator.
Definition: Helpers.inl:134
Interface for the kernel to apply a non-linear filter.
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
Cross pattern matrix.
MatrixPattern
Available matrix patterns.
Definition: Types.h:504
Any other matrix pattern.
Describe a multidimensional execution window.
Definition: Window.h:39
NonLinearFilterFunction
Available non linear functions.
Definition: Types.h:513
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205