// All-zero 128-bit vector. min_row()/max_row() pass it as the second operand of
// vextq_u8 so that zeros are shifted in at the top while the row is shifted down.
45 const uint8x16_t zero_u8 = vdupq_n_u8(0);
// Horizontal sliding minimum over `columns` adjacent bytes, for eight output lanes.
// Lane i of the running result covers row_data[i] .. row_data[i + columns - 1].
// NOTE(review): braces and the return statement are elided in this listing;
// presumably the function returns `min` - confirm against the full source.
// NOTE(review): the zeros shifted in from zero_u8 only reach the low eight lanes
// once columns exceeds 9; the instantiations visible in this file use 3 and 5.
47 template <
size_t columns>
48 inline uint8x8_t min_row(uint8x16_t row_data)
// Seed with the window's first column (offset 0).
50 uint8x8_t min = vget_low_u8(row_data);
// Each pass drops the lowest byte of the row (shift by one column) and folds the
// new low half into the running minimum.
52 for(
size_t c = 1; c < columns; ++c)
54 row_data = vextq_u8(row_data, zero_u8, 1);
55 min = vmin_u8(min, vget_low_u8(row_data));
// Horizontal sliding maximum over `columns` adjacent bytes, for eight output lanes.
// Lane i of the running result covers row_data[i] .. row_data[i + columns - 1].
// NOTE(review): braces and the return statement are elided in this listing;
// presumably the function returns `max` - confirm against the full source.
// NOTE(review): zeros shifted in from zero_u8 never raise a maximum, but they are
// only neutral because the data is unsigned.
61 template <
size_t columns>
62 inline uint8x8_t max_row(uint8x16_t row_data)
// Seed with the window's first column (offset 0).
64 uint8x8_t max = vget_low_u8(row_data);
// Each pass drops the lowest byte of the row (shift by one column) and folds the
// new low half into the running maximum.
66 for(
size_t c = 1; c < columns; ++c)
68 row_data = vextq_u8(row_data, zero_u8, 1);
69 max = vmax_u8(max, vget_low_u8(row_data));
// Lane-wise compare-exchange of two 8-lane vectors passed by reference.
// The visible lines compute the per-lane minimum and maximum of a and b.
// NOTE(review): the write-back is elided in this listing; presumably a receives
// `min` and b receives `max` - confirm against the full source.
75 inline void sort(uint8x8_t &a, uint8x8_t &
b)
77 const uint8x8_t min = vmin_u8(a, b);
78 const uint8x8_t max = vmax_u8(a, b);
// Operates in place on five 8-lane vectors passed by reference.
// NOTE(review): body elided in this listing; presumably a compare-exchange network
// built on sort(). median_filter_cross<3, 3> stores p2 after calling it.
85 inline void sort5(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2, uint8x8_t &p3, uint8x8_t &p4)
// Operates in place on nine 8-lane vectors passed by reference.
// NOTE(review): body elided in this listing; presumably a compare-exchange network
// built on sort(). Callers in this file store p4 after calling it.
97 inline void sort9(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2,
98 uint8x8_t &p3, uint8x8_t &p4, uint8x8_t &p5,
99 uint8x8_t &p6, uint8x8_t &p7, uint8x8_t &p8)
// Operates in place on an array of 21 8-lane vectors.
// NOTE(review): body elided in this listing; median_filter_disk<5, 5> stores p[10]
// (the middle of 21 entries), presumably after calling this - confirm.
122 inline void sort21(std::array<uint8x8_t, 21> &p)
// Operates in place on an array of 25 8-lane vectors.
// NOTE(review): body elided in this listing; median_filter_box<5, 5> stores p[12]
// (the middle of 25 entries), presumably after calling this - confirm.
227 inline void sort25(std::array<uint8x8_t, 25> &p)
356 bool border_undefined)
// NOTE(review): this is the tail of a longer definition; the start of the signature
// and several statements are elided in this listing.
// Remember which non-linear function was requested.
369 _function =
function;
// 16 input bytes are read per iteration, matching the vld1q_u8 loads in the kernels.
373 constexpr
unsigned int num_elems_read_per_iteration = 16;
382 INEKernel::configure(win);
// Mask-size index: 0 for a mask size of 3, 1 for anything else.
385 _func_idx = (3 == mask_size) ? 0 : 1;
// Combine mask-size index and function into one index into the 6-entry dispatch
// tables used by run() (three entries per mask size).
// NOTE(review): assumes the enum values map to 0..2 in the tables' median/min/max
// order, and that this line is skipped for the 2-entry generic table - confirm.
389 _func_idx = (_func_idx) * 3 + static_cast<unsigned int>(
function);
// Row-major walk over a rows x cols mask; v advances once per element.
// NOTE(review): the enclosing function header and the branch selecting between the
// two `val` formulas are elided; presumably a switch on the matrix pattern.
397 for(
int r = 0; r <
rows; ++r)
399 for(
int c = 0; c <
cols; ++c, ++v)
// Cross shape: 255 on the centre row or the centre column, 0 elsewhere.
409 val = ((r == (rows / 2)) || (c == (cols / 2))) ? 255 : 0;
// Ellipse shape: 255 when the cell centre (r + 0.5, c + 0.5) lies inside the ellipse
// centred on (rows / 2, cols / 2) with semi-axes rows / 2 and cols / 2, 0 otherwise.
412 val = (((r - rows / 2.0f + 0.5f) * (r - rows / 2.0f + 0.5f)) / ((rows / 2.0f) * (rows / 2.0f)) + ((c - cols / 2.0f + 0.5f) * (c - cols / 2.0f + 0.5f)) / ((cols / 2.0f) *
413 (cols / 2.0f))) <= 1.0f ? 255 : 0;
// 3x3 box median producing eight output bytes per iteration.
// NOTE(review): the iterator setup, row-pointer definitions and loop wrapper are
// elided in this listing.
425 void NENonLinearFilterKernel::median_filter_box<3, 3>(
const Window &win)
// Load 16 bytes from each of the three rows at the iterator's current offset.
436 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.
offset());
437 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.
offset());
438 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.
offset());
// Build the nine window taps: each row at column offsets 0, 1 and 2.
440 uint8x8_t p0 = vget_low_u8(top_data);
441 uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1);
442 uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2);
443 uint8x8_t p3 = vget_low_u8(mid_data);
444 uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
445 uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
446 uint8x8_t p6 = vget_low_u8(bot_data);
447 uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1);
448 uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2);
450 sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
// p4 is the middle of the nine taps; it is written out as the result.
452 vst1_u8(output.
ptr(), p4);
// 5x5 box median producing eight output bytes per iteration.
// NOTE(review): the iterator setup, row-pointer definitions, loop wrapper and some
// statements (listing lines 496 and 503) are elided in this listing.
457 void NENonLinearFilterKernel::median_filter_box<5, 5>(
const Window &win)
// Load 16 bytes from each of the five rows at the iterator's current offset.
470 const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.
offset());
471 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.
offset());
472 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.
offset());
473 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.
offset());
474 const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.
offset());
// Low/high halves of every row, stored pairwise: d[2 * i] and d[2 * i + 1] belong to row i.
476 const std::array<uint8x8_t, 10> d =
478 vget_low_u8(top2_data),
479 vget_high_u8(top2_data),
480 vget_low_u8(top_data),
481 vget_high_u8(top_data),
482 vget_low_u8(mid_data),
483 vget_high_u8(mid_data),
484 vget_low_u8(bot_data),
485 vget_high_u8(bot_data),
486 vget_low_u8(bot2_data),
487 vget_high_u8(bot2_data)
// 25 window taps, five per row.
490 std::array<uint8x8_t, 25> p{ 0 };
491 for(
unsigned int i = 0; i < 5; ++i)
493 const unsigned int idx_d = i * 2;
494 const unsigned int idx_p = i * 5;
// Column offsets 1..4 of row i.
// NOTE(review): the offset-0 tap (presumably p[idx_p] = d[idx_d]) is on an elided line.
497 p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
498 p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
499 p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
500 p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
// p[12] is the middle of the 25 taps.
// NOTE(review): the sorting call (presumably sort25(p)) is on an elided line.
505 vst1_u8(output.
ptr(), p[12]);
// Box minimum over a mask_w x mask_h window, eight output bytes per iteration.
// The vertical minimum across rows is taken first, then min_row<mask_w> reduces it
// horizontally.
// NOTE(review): the iterator setup, the body of the pointer-initialising loop and the
// loop wrapper are elided in this listing.
510 template <
int mask_w,
int mask_h>
511 void NENonLinearFilterKernel::min_filter_box(
const Window &win)
513 static_assert(mask_w > 0,
"Mask size must not be 0");
514 static_assert(mask_h > 0,
"Mask size must not be 0");
519 const int k_row_half = mask_h / 2;
520 const int k_col_half = mask_w / 2;
// One base pointer per mask row, filled for row offsets -k_row_half..k_row_half.
523 std::array<const unsigned char *, mask_h> input_ptrs{ {} };
524 for(
int i = -k_row_half; i <= k_row_half; ++i)
// Vertical reduction: element-wise minimum over all mask rows (16 bytes wide).
532 uint8x16_t rows_min = vld1q_u8(input_ptrs[0] + input.
offset());
// NOTE(review): `r` is unsigned while mask_h is an int template parameter, so this is
// a signed/unsigned comparison; harmless because mask_h > 0 is asserted above.
534 for(
unsigned int r = 1; r < mask_h; ++r)
536 const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.
offset());
537 rows_min = vminq_u8(rows_min, data);
// Horizontal reduction across mask_w adjacent columns.
540 const uint8x8_t out = min_row<mask_w>(rows_min);
543 vst1_u8(output.
ptr(), out);
// Box maximum over a mask_w x mask_h window, eight output bytes per iteration.
// The vertical maximum across rows is taken first, then max_row<mask_w> reduces it
// horizontally.
// NOTE(review): the iterator setup, the body of the pointer-initialising loop and the
// loop wrapper are elided in this listing.
548 template <
int mask_w,
int mask_h>
549 void NENonLinearFilterKernel::max_filter_box(
const Window &win)
551 static_assert(mask_w > 0,
"Mask size must not be 0");
552 static_assert(mask_h > 0,
"Mask size must not be 0");
558 const int k_row_half = mask_h / 2;
559 const int k_col_half = mask_w / 2;
// One base pointer per mask row, filled for row offsets -k_row_half..k_row_half.
562 std::array<const unsigned char *, mask_h> input_ptrs{ {} };
563 for(
int i = -k_row_half; i <= k_row_half; ++i)
// Vertical reduction: element-wise maximum over all mask rows (16 bytes wide).
570 uint8x16_t rows_max = vld1q_u8(input_ptrs[0] + input.offset());
// NOTE(review): `r` is unsigned while mask_h is an int template parameter, so this is
// a signed/unsigned comparison; harmless because mask_h > 0 is asserted above.
573 for(
unsigned int r = 1; r < mask_h; ++r)
575 const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
576 rows_max = vmaxq_u8(rows_max, data);
// Horizontal reduction across mask_w adjacent columns.
580 const uint8x8_t out = max_row<mask_w>(rows_max);
583 vst1_u8(output.
ptr(), out);
// 3x3 cross median (five taps) producing eight output bytes per iteration.
// NOTE(review): the iterator setup, row-pointer definitions and loop wrapper are
// elided; presumably the top/bottom pointers already address the centre column while
// the middle pointer addresses the leftmost column - confirm.
589 void NENonLinearFilterKernel::median_filter_cross<3, 3>(
const Window &win)
// Only the middle row needs a 16-byte load (three horizontal taps); the rows above and
// below contribute a single 8-byte tap each.
600 const uint8x8_t top_data = vld1_u8(input_top_ptr + input.
offset());
601 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.
offset());
602 const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.
offset());
// Taps: top, middle row at column offsets 0/1/2, bottom.
604 uint8x8_t p0 = top_data;
605 uint8x8_t p1 = vget_low_u8(mid_data);
606 uint8x8_t p2 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
607 uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
608 uint8x8_t p4 = bot_data;
610 sort5(p0, p1, p2, p3, p4);
// p2 is the middle of the five taps; it is written out as the result.
612 vst1_u8(output.
ptr(), p2);
// 5x5 cross median (nine taps) producing eight output bytes per iteration.
// NOTE(review): the iterator setup, row-pointer definitions and loop wrapper are
// elided; presumably the four outer-row pointers address the centre column while the
// middle pointer addresses the leftmost column - confirm.
618 void NENonLinearFilterKernel::median_filter_cross<5, 5>(
const Window &win)
// Only the middle row needs a 16-byte load (five horizontal taps); the other four rows
// contribute a single 8-byte tap each.
631 const uint8x8_t top2_data = vld1_u8(input_top2_ptr + input.
offset());
632 const uint8x8_t top_data = vld1_u8(input_top_ptr + input.
offset());
633 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.
offset());
634 const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.
offset());
635 const uint8x8_t bot2_data = vld1_u8(input_bot2_ptr + input.
offset());
// Taps: two rows above, middle row at column offsets 0..4, two rows below.
637 uint8x8_t p0 = top2_data;
638 uint8x8_t p1 = top_data;
639 uint8x8_t p2 = vget_low_u8(mid_data);
640 uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
641 uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
642 uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 3);
643 uint8x8_t p6 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 4);
644 uint8x8_t p7 = bot_data;
645 uint8x8_t p8 = bot2_data;
647 sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
// p4 is the middle of the nine taps; it is written out as the result.
649 vst1_u8(output.
ptr(), p4);
// Cross-shaped minimum: the vertical arm (mask_h rows, one column) is reduced with
// 8-byte loads, the horizontal arm (middle row, mask_w columns) with min_row, and the
// two partial results are combined.
// NOTE(review): the iterator setup, the definition of mid_ptr, the body of the
// pointer-initialising loop and the loop wrapper are elided in this listing.
654 template <
int mask_w,
int mask_h>
655 void NENonLinearFilterKernel::min_filter_cross(
const Window &win)
657 static_assert(mask_w > 0,
"Mask size must not be 0");
658 static_assert(mask_h > 0,
"Mask size must not be 0");
664 const int k_row_half = mask_h / 2;
665 const int k_col_half = mask_w / 2;
// One base pointer per mask row, filled for row offsets -k_row_half..k_row_half.
670 std::array<const unsigned char *, mask_h> input_ptrs{ {} };
671 for(
int i = -k_row_half; i <= k_row_half; ++i)
// Vertical arm: element-wise minimum over all mask rows (8 bytes wide).
678 uint8x8_t rows_min = vld1_u8(input_ptrs[0] + input.offset());
// NOTE(review): unsigned `r` compared against the int template parameter mask_h.
681 for(
unsigned int r = 1; r < mask_h; ++r)
683 const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
684 rows_min = vmin_u8(rows_min, data);
// Horizontal arm: sliding minimum over mask_w columns of the middle row.
688 const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
689 uint8x8_t out = min_row<mask_w>(data);
// Merge both arms.
692 out = vmin_u8(out, rows_min);
695 vst1_u8(output.
ptr(), out);
// Cross-shaped maximum: the vertical arm (mask_h rows, one column) is reduced with
// 8-byte loads, the horizontal arm (middle row, mask_w columns) with max_row, and the
// two partial results are combined.
// NOTE(review): the iterator setup, the definition of mid_ptr, the body of the
// pointer-initialising loop and the loop wrapper are elided in this listing.
700 template <
int mask_w,
int mask_h>
701 void NENonLinearFilterKernel::max_filter_cross(
const Window &win)
703 static_assert(mask_w > 0,
"Mask size must not be 0");
704 static_assert(mask_h > 0,
"Mask size must not be 0");
710 const int k_row_half = mask_h / 2;
711 const int k_col_half = mask_w / 2;
// One base pointer per mask row, filled for row offsets -k_row_half..k_row_half.
// NOTE(review): min_filter_cross declares the same array with `const unsigned char *`;
// the pointers are only read through here, so the missing const looks unintentional.
716 std::array<unsigned char *, mask_h> input_ptrs{ {} };
717 for(
int i = -k_row_half; i <= k_row_half; ++i)
// Vertical arm: element-wise maximum over all mask rows (8 bytes wide).
724 uint8x8_t rows_max = vld1_u8(input_ptrs[0] + input.offset());
// NOTE(review): unsigned `r` compared against the int template parameter mask_h.
727 for(
unsigned int r = 1; r < mask_h; ++r)
729 const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
730 rows_max = vmax_u8(rows_max, data);
// Horizontal arm: sliding maximum over mask_w columns of the middle row.
734 const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
735 uint8x8_t out = max_row<mask_w>(data);
// Merge both arms.
738 out = vmax_u8(out, rows_max);
741 vst1_u8(output.
ptr(), out);
// 5x5 disk median (21 taps: 3 + 5 + 5 + 5 + 3) producing eight output bytes per
// iteration.
// NOTE(review): the iterator setup, row-pointer definitions, loop wrapper and some
// statements (listing lines 782, 785, 794 and 801) are elided in this listing.
747 void NENonLinearFilterKernel::median_filter_disk<5, 5>(
const Window &win)
// NOTE(review): duplicates the file-level zero_u8 constant.
752 static const uint8x16_t zero = vdupq_n_u8(0);
// The outermost rows are shifted down by one byte after loading, so their column
// offsets 0..2 correspond to offsets 1..3 of the unshifted rows.
761 const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.
offset()), zero, 1);
762 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.
offset());
763 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.
offset());
764 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.
offset());
765 const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.
offset()), zero, 1);
// Low/high halves of every row, stored pairwise: d[2 * i] and d[2 * i + 1] belong to row i.
767 std::array<uint8x8_t, 10> d =
769 vget_low_u8(top2_data),
770 vget_high_u8(top2_data),
771 vget_low_u8(top_data),
772 vget_high_u8(top_data),
773 vget_low_u8(mid_data),
774 vget_high_u8(mid_data),
775 vget_low_u8(bot_data),
776 vget_high_u8(bot_data),
777 vget_low_u8(bot2_data),
778 vget_high_u8(bot2_data)
781 std::array<uint8x8_t, 21> p{ 0 };
// Outermost top row: three taps in p[0..2].
// NOTE(review): p[0] is presumably assigned d[0] on an elided line.
783 p[1] = vext_u8(d[0], d[1], 1);
784 p[2] = vext_u8(d[0], d[1], 2);
// Outermost bottom row: three taps in p[18..20].
// NOTE(review): p[18] is presumably assigned d[8] on an elided line.
786 p[19] = vext_u8(d[8], d[9], 1);
787 p[20] = vext_u8(d[8], d[9], 2);
// Three inner rows (d[2..7]): five taps each, stored in p[3..17].
789 for(
unsigned int i = 0; i < 3; ++i)
791 const unsigned int idx_d = 2 + i * 2;
792 const unsigned int idx_p = 3 + i * 5;
// NOTE(review): the offset-0 tap (presumably p[idx_p] = d[idx_d]) is on an elided line.
795 p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
796 p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
797 p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
798 p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
// p[10] is the middle of the 21 taps.
// NOTE(review): the sorting call (presumably sort21(p)) is on an elided line.
803 vst1_u8(output.
ptr(), p[10]);
// 5x5 disk minimum: the two outermost rows contribute three columns each, the three
// inner rows contribute five columns each. Eight output bytes per iteration.
// NOTE(review): the iterator setup, row-pointer definitions and loop wrapper are
// elided in this listing.
809 void NENonLinearFilterKernel::min_filter_disk<5, 5>(
const Window &win)
// NOTE(review): duplicates the file-level zero_u8 constant.
814 static const uint8x16_t zero = vdupq_n_u8(0);
// The outermost rows are shifted down by one byte after loading, so a 3-wide
// reduction on them covers column offsets 1..3 of the unshifted rows.
823 const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.
offset()), zero, 1);
824 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.
offset());
825 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.
offset());
826 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.
offset());
827 const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.
offset()), zero, 1);
// Vertical reductions: outer pair (3-wide rows) and inner triple (5-wide rows).
829 const uint8x16_t rows_min_3 = vminq_u8(top2_data, bot2_data);
830 uint8x16_t rows_min_5 = vminq_u8(top_data, bot_data);
831 rows_min_5 = vminq_u8(rows_min_5, mid_data);
// Horizontal reductions with the width matching each group.
833 const uint8x8_t out_3 = min_row<3>(rows_min_3);
834 const uint8x8_t out_5 = min_row<5>(rows_min_5);
// Final result is the minimum of both groups.
836 vst1_u8(output.
ptr(), vmin_u8(out_3, out_5));
// 5x5 disk maximum: the two outermost rows contribute three columns each, the three
// inner rows contribute five columns each. Eight output bytes per iteration.
// NOTE(review): the iterator setup, row-pointer definitions and loop wrapper are
// elided in this listing.
842 void NENonLinearFilterKernel::max_filter_disk<5, 5>(
const Window &win)
// NOTE(review): duplicates the file-level zero_u8 constant.
847 static const uint8x16_t zero = vdupq_n_u8(0);
// The outermost rows are shifted down by one byte after loading, so a 3-wide
// reduction on them covers column offsets 1..3 of the unshifted rows.
856 const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.
offset()), zero, 1);
857 const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.
offset());
858 const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.
offset());
859 const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.
offset());
860 const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.
offset()), zero, 1);
// Vertical reductions: outer pair (3-wide rows) and inner triple (5-wide rows).
862 const uint8x16_t rows_max_3 = vmaxq_u8(top2_data, bot2_data);
863 uint8x16_t rows_max_5 = vmaxq_u8(top_data, bot_data);
864 rows_max_5 = vmaxq_u8(rows_max_5, mid_data);
// Horizontal reductions with the width matching each group.
866 const uint8x8_t out_3 = max_row<3>(rows_max_3);
867 const uint8x8_t out_5 = max_row<5>(rows_max_5);
// Final result is the maximum of both groups.
869 vst1_u8(output.
ptr(), vmax_u8(out_3, out_5));
// Scalar fallback: gathers window values into `vals`, sorts the first v of them and
// writes a single output byte per iteration.
// NOTE(review): the iterator setup, the gathering statements inside the inner loop,
// the declarations of v and m, and the branch selecting between the three stores are
// elided in this listing.
874 template <
int mask_w,
int mask_h>
875 void NENonLinearFilterKernel::non_linear_filter_generic(
const Window &win)
881 const int k_row_half = mask_h / 2;
882 const int k_col_half = mask_w / 2;
883 constexpr
int mask_size = mask_w * mask_h;
// One base pointer per mask row, filled for row offsets -k_row_half..k_row_half.
// NOTE(review): the pointers are read through a const cast below; other kernels in
// this file declare the array as `const unsigned char *`.
886 std::array<unsigned char *, mask_h> input_ptrs{ {} };
887 for(
int i = -k_row_half; i <= k_row_half; ++i)
// Scratch buffer large enough for every element of the window.
892 std::array<uint8_t, mask_size> vals{ {} };
// Row-major walk over the window; m advances once per window element.
// NOTE(review): presumably m indexes the user mask and v counts the selected
// elements - confirm against the elided lines.
902 for(
unsigned int r = 0; r < mask_h; ++r)
904 const auto in_ptr =
static_cast<const uint8_t *
>(input_ptrs[r] + input.
offset());
906 for(
unsigned int c = 0; c < mask_w; ++c, ++m)
// Only the first v entries take part in the ordering.
920 std::sort(vals.begin(), vals.begin() + v);
// Smallest gathered value.
925 *output.
ptr() = vals[0];
// Largest gathered value.
// NOTE(review): if v can be 0 (no element selected) this indexes vals[-1]; confirm
// that an all-zero mask is rejected before this point.
928 *output.
ptr() = vals[v - 1];
// Middle gathered value (upper median when v is even).
931 *output.
ptr() = vals[v / 2];
// Dispatch tables of pointer-to-member kernels, indexed by _func_idx.
// Each 6-entry table holds median/min/max for 3x3 followed by median/min/max for 5x5.
// NOTE(review): the enclosing function header, the table braces and the branch that
// picks a table are elided in this listing.
950 static const std::array<NonLinearFilterFunction, 6> func_table_box =
953 &NENonLinearFilterKernel::median_filter_box<3, 3>,
954 &NENonLinearFilterKernel::min_filter_box<3, 3>,
955 &NENonLinearFilterKernel::max_filter_box<3, 3>,
956 &NENonLinearFilterKernel::median_filter_box<5, 5>,
957 &NENonLinearFilterKernel::min_filter_box<5, 5>,
958 &NENonLinearFilterKernel::max_filter_box<5, 5>,
963 static const std::array<NonLinearFilterFunction, 6> func_table_cross =
966 &NENonLinearFilterKernel::median_filter_cross<3, 3>,
967 &NENonLinearFilterKernel::min_filter_cross<3, 3>,
968 &NENonLinearFilterKernel::max_filter_cross<3, 3>,
969 &NENonLinearFilterKernel::median_filter_cross<5, 5>,
970 &NENonLinearFilterKernel::min_filter_cross<5, 5>,
971 &NENonLinearFilterKernel::max_filter_cross<5, 5>,
// The 3x3 disk entries reuse the 3x3 box kernels; only 5x5 has dedicated disk kernels.
976 static const std::array<NonLinearFilterFunction, 6> func_table_disk =
979 &NENonLinearFilterKernel::median_filter_box<3, 3>,
980 &NENonLinearFilterKernel::min_filter_box<3, 3>,
981 &NENonLinearFilterKernel::max_filter_box<3, 3>,
982 &NENonLinearFilterKernel::median_filter_disk<5, 5>,
983 &NENonLinearFilterKernel::min_filter_disk<5, 5>,
984 &NENonLinearFilterKernel::max_filter_disk<5, 5>,
// The generic table has one entry per mask size, so it expects _func_idx to be 0 or 1.
989 static const std::array<NonLinearFilterFunction, 2> func_table_generic =
992 &NENonLinearFilterKernel::non_linear_filter_generic<3, 3>,
993 &NENonLinearFilterKernel::non_linear_filter_generic<5, 5>,
// Invoke the selected kernel on the given window.
1001 (this->*func_table_box[_func_idx])(window);
1005 (this->*func_table_cross[_func_idx])(window);
1009 (this->*func_table_disk[_func_idx])(window);
1014 (this->*func_table_generic[_func_idx])(window);
unsigned int top
top of the border
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
BorderSize border_size() const override
The size of the border for that kernel.
virtual int32_t offset_element_in_bytes(const Coordinates &pos) const =0
The offset in bytes from the beginning of the memory allocation to access the element at position (x, y, z, ...).
Container for 2D border size.
1 channel, 1 U8 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Interface for Neon tensor.
uchar8 sort5(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4)
Sorting network to sort 5 vectors of 8 elements and return their median.
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Implementation of a rectangular access pattern.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
library fill(src, distribution, 0)
virtual uint8_t * buffer() const =0
Interface to be implemented by the child class to return a pointer to CPU memory.
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
void end(TokenStream &in, bool &valid)
NENonLinearFilterKernel()
Default constructor.
unsigned int left
left of the border
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
void configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, bool border_undefined)
Set the source, destination and border mode of the kernel.
Information about executing thread and CPU.
uchar8 sort9(uchar8 p0, uchar8 p1, uchar8 p2, uchar8 p3, uchar8 p4, uchar8 p5, uchar8 p6, uchar8 p7, uchar8 p8)
Sorting network to sort 9 vectors of 8 elements and return their median.
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
constexpr size_t offset() const
Return the offset in bytes from the first element to the current position of the iterator.
Interface for the kernel to apply a non-linear filter.
Iterator updated by execute_window_loop for each window element.
MatrixPattern
Available matrix patterns.
Any other matrix pattern.
Describe a multidimensional execution window.
NonLinearFilterFunction
Available non linear functions.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Non linear median filter.