// Accumulates a local histogram into a global one, bin by bin:
// global_hist[b] += local_hist[b] for every b in [0, bins).
//   global_hist  destination counters, updated in place
//   local_hist   source counters, read only
//   bins         number of uint32_t bins processed in both arrays
// NOTE(review): this listing skips some original line numbers (45-47, 49, 51, ...),
// so the braces and possibly other statements are not visible here; in particular,
// whether _hist_mtx is held during the merge cannot be confirmed from this view.
// NOTE(review): v_end and the loop counters are unsigned int while bins is size_t.
44 inline void NEHistogramKernel::merge_histogram(uint32_t *global_hist,
const uint32_t *local_hist,
size_t bins)
// Largest multiple of 4 that does not exceed bins: end of the 4-lane loop.
48 const unsigned int v_end = (bins / 4) * 4;
// Vector part: load four bins from each histogram, add lane-wise, store the
// sums back into global_hist.
50 for(
unsigned int b = 0;
b < v_end;
b += 4)
52 const uint32x4_t tmp_global = vld1q_u32(global_hist +
b);
53 const uint32x4_t tmp_local = vld1q_u32(local_hist +
b);
54 vst1q_u32(global_hist +
b, vaddq_u32(tmp_global, tmp_local));
// Scalar tail: the remaining (bins % 4) bins, one at a time.
57 for(
unsigned int b = v_end;
b < bins; ++
b)
59 global_hist[
b] += local_hist[
b];
// Member initializer list (the constructor header is not visible in this listing):
// every pointer member starts as nullptr and _hist_mtx is default-constructed.
64 : _func(nullptr), _input(nullptr), _output(nullptr), _local_hist(nullptr), _window_lut(nullptr), _hist_mtx()
// Body fragment of a LUT-based histogram routine. The enclosing signature and
// several statements (e.g. the definitions of offset, input, x and x_end) are
// not visible in this listing.
// NOTE(review): presumably NEHistogramKernel::histogram_U8 — confirm.
// Number of bins of the output distribution.
72 const size_t bins = _output->
num_bins();
// Exclusive upper bound of the accepted pixel values: offset + range.
74 const uint32_t offrange = offset + _output->
range();
75 const uint32_t *
const w_lut = _window_lut;
// This thread's slice of the shared scratch buffer: slice number thread_id,
// each slice being `bins` counters long.
76 uint32_t *
const local_hist = _local_hist + info.
thread_id * bins;
// Reset the slice before counting.
79 std::fill_n(local_hist, bins, 0);
// Counts one pixel value: only values in [offset, offrange) are counted, and
// the bin index is read from the lookup table.
81 auto update_local_hist = [&](uint8_t p)
83 if(offset <= p && p < offrange)
85 ++local_hist[w_lut[p]];
// Main loop: load 8 pixels at once and feed each lane to the update lambda.
105 for(; x <= x_end - 8; x += 8)
107 const uint8x8_t pixels = vld1_u8(input.
ptr() + x);
109 update_local_hist(vget_lane_u8(pixels, 0));
110 update_local_hist(vget_lane_u8(pixels, 1));
111 update_local_hist(vget_lane_u8(pixels, 2));
112 update_local_hist(vget_lane_u8(pixels, 3));
113 update_local_hist(vget_lane_u8(pixels, 4));
114 update_local_hist(vget_lane_u8(pixels, 5));
115 update_local_hist(vget_lane_u8(pixels, 6));
116 update_local_hist(vget_lane_u8(pixels, 7));
// Tail loop: remaining pixels up to x_end, one at a time.
120 for(; x <
x_end; ++x)
122 update_local_hist(input.
ptr()[x]);
// Add the local counts into the output distribution's buffer.
128 merge_histogram(_output->
buffer(), local_hist, bins);
// Histogram routine that indexes the counters directly by pixel value, with no
// offset/range test and no lookup table in the visible lines.
//   win   execution window, taken by value
//   info  thread information
// NOTE(review): original lines 137-153 (presumably the window/iterator setup and
// the definitions of input, x and x_end) are not visible in this listing.
131 void NEHistogramKernel::histogram_fixed_U8(
Window win,
const ThreadInfo &info)
// Function-local counters, one per possible value, all zero-initialised.
// NOTE(review): indexing by a raw uint8 value assumes _max_range_size >= 256 — confirm.
136 std::array<uint32_t, _max_range_size> local_hist{ { 0 } };
// Main loop: load 8 pixels at once and bump the counter of each lane's value.
154 for(; x <= x_end - 8; x += 8)
156 const uint8x8_t pixels = vld1_u8(input.
ptr() + x);
158 ++local_hist[vget_lane_u8(pixels, 0)];
159 ++local_hist[vget_lane_u8(pixels, 1)];
160 ++local_hist[vget_lane_u8(pixels, 2)];
161 ++local_hist[vget_lane_u8(pixels, 3)];
162 ++local_hist[vget_lane_u8(pixels, 4)];
163 ++local_hist[vget_lane_u8(pixels, 5)];
164 ++local_hist[vget_lane_u8(pixels, 6)];
165 ++local_hist[vget_lane_u8(pixels, 7)];
// Tail loop: remaining pixels up to x_end, one at a time.
169 for(; x <
x_end; ++x)
171 ++local_hist[input.
ptr()[x]];
// Add all _max_range_size local counters into the output distribution's buffer.
177 merge_histogram(_output->
buffer(), local_hist.data(), _max_range_size);
// Fills _window_lut, the table the LUT-based histogram routine reads bin
// indices from.
// NOTE(review): the leading `const` on the line numbered 183 looks like this
// method's trailing const qualifier displaced by extraction — confirm.
// NOTE(review): the definition of offset and the body of the loop below are
// not visible in this listing.
180 void NEHistogramKernel::calculate_window_lut()
const 183 const size_t bins = _output->
num_bins();
// The first `offset` entries of the table are set to 0.
186 std::fill_n(_window_lut, offset, 0);
// Remaining entries, from offset up to _max_range_size (loop body not shown).
188 for(
unsigned int p = offset; p < _max_range_size; ++p)
// Fragment of a configure routine; its signature and argument checks are not
// visible in this listing.
// Keep the caller-provided scratch histogram and lookup-table buffers.
204 _local_hist = local_hist;
205 _window_lut = window_lut;
// Reject an output range larger than _max_range_size (both compared as int32_t).
211 ARM_COMPUTE_ERROR_ON_MSG(static_cast<int32_t>(_output->
range()) > static_cast<int32_t>(_max_range_size) ,
"Range larger than the image value range.");
// Build the lookup table, then select the LUT-based histogram routine.
214 calculate_window_lut();
217 _func = &NEHistogramKernel::histogram_U8;
// Register the execution window (win is defined in lines not shown here).
221 INEKernel::configure(win);
// Fragment of a second configure routine (signature not visible in this
// listing): selects the routine that indexes counters directly by pixel value.
234 _func = &NEHistogramKernel::histogram_fixed_U8;
// Register the execution window (win is defined in lines not shown here).
238 INEKernel::configure(win);
// Dispatch to the histogram routine stored in _func by a configure routine,
// forwarding window and info unchanged.
// NOTE(review): the enclosing signature and any precondition checks are not
// visible in this listing.
247 (this->*_func)(window, info);
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of an Image.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
NEHistogramKernel()
Default constructor.
#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t)
1 channel, 1 U8 per channel
uint32_t range() const
Returns the range of the distribution.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Describe one of the image's dimensions with a start, end and step.
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
virtual uint32_t * buffer() const =0
Returns a pointer to the start of the distribution.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
SimpleTensor< T > range(SimpleTensor< T > &dst, float start, const size_t num_of_elements, float step)
1D Distribution interface
Class to describe a number of elements in each dimension.
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
size_t num_bins() const
Returns the number of bins that the distribution has.
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
constexpr int end() const
Return the end of the dimension.
Iterator updated by execute_window_loop for each window element.
std::lock_guard< Mutex > lock_guard
Wrapper of lock_guard data-object.
constexpr int start() const
Return the start of the dimension.
void configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut)
Set the input image and the distribution output.
Describe a multidimensional execution window.
int32_t offset() const
Returns the offset of the distribution.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
constexpr const Dimension & x() const
Alias to access the first dimension of the window.