Compute Library
 21.02
NEHistogramKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
26 #include "arm_compute/core/Error.h"
31 #include "arm_compute/core/Types.h"
35 
36 #include <algorithm>
37 #include <arm_neon.h>
38 #include <array>
39 
40 namespace arm_compute
41 {
42 class Coordinates;
43 
44 inline void NEHistogramKernel::merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins)
45 {
47 
48  const unsigned int v_end = (bins / 4) * 4;
49 
50  for(unsigned int b = 0; b < v_end; b += 4)
51  {
52  const uint32x4_t tmp_global = vld1q_u32(global_hist + b);
53  const uint32x4_t tmp_local = vld1q_u32(local_hist + b);
54  vst1q_u32(global_hist + b, vaddq_u32(tmp_global, tmp_local));
55  }
56 
57  for(unsigned int b = v_end; b < bins; ++b)
58  {
59  global_hist[b] += local_hist[b];
60  }
61 }
62 
64  : _func(nullptr), _input(nullptr), _output(nullptr), _local_hist(nullptr), _window_lut(nullptr), _hist_mtx()
65 {
66 }
67 
68 void NEHistogramKernel::histogram_U8(Window win, const ThreadInfo &info)
69 {
70  ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
71 
72  const size_t bins = _output->num_bins();
73  const int32_t offset = _output->offset();
74  const uint32_t offrange = offset + _output->range();
75  const uint32_t *const w_lut = _window_lut;
76  uint32_t *const local_hist = _local_hist + info.thread_id * bins;
77 
78  // Clear local_histogram
79  std::fill_n(local_hist, bins, 0);
80 
81  auto update_local_hist = [&](uint8_t p)
82  {
83  if(offset <= p && p < offrange)
84  {
85  ++local_hist[w_lut[p]];
86  }
87  };
88 
89  const int x_start = win.x().start();
90  const int x_end = win.x().end();
91 
92  // Handle X dimension manually to split into two loops
93  // First one will use vector operations, second one processes the left over
94  // pixels
95  win.set(Window::DimX, Window::Dimension(0, 1, 1));
96 
97  Iterator input(_input, win);
98 
99  // Calculate local histogram
100  execute_window_loop(win, [&](const Coordinates &)
101  {
102  int x = x_start;
103 
104  // Vector loop
105  for(; x <= x_end - 8; x += 8)
106  {
107  const uint8x8_t pixels = vld1_u8(input.ptr() + x);
108 
109  update_local_hist(vget_lane_u8(pixels, 0));
110  update_local_hist(vget_lane_u8(pixels, 1));
111  update_local_hist(vget_lane_u8(pixels, 2));
112  update_local_hist(vget_lane_u8(pixels, 3));
113  update_local_hist(vget_lane_u8(pixels, 4));
114  update_local_hist(vget_lane_u8(pixels, 5));
115  update_local_hist(vget_lane_u8(pixels, 6));
116  update_local_hist(vget_lane_u8(pixels, 7));
117  }
118 
119  // Process leftover pixels
120  for(; x < x_end; ++x)
121  {
122  update_local_hist(input.ptr()[x]);
123  }
124  },
125  input);
126 
127  // Merge histograms
128  merge_histogram(_output->buffer(), local_hist, bins);
129 }
130 
131 void NEHistogramKernel::histogram_fixed_U8(Window win, const ThreadInfo &info)
132 {
133  ARM_COMPUTE_UNUSED(info);
134  ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
135 
136  std::array<uint32_t, _max_range_size> local_hist{ { 0 } };
137 
138  const int x_start = win.x().start();
139  const int x_end = win.x().end();
140 
141  // Handle X dimension manually to split into two loops
142  // First one will use vector operations, second one processes the left over
143  // pixels
144  win.set(Window::DimX, Window::Dimension(0, 1, 1));
145 
146  Iterator input(_input, win);
147 
148  // Calculate local histogram
149  execute_window_loop(win, [&](const Coordinates &)
150  {
151  int x = x_start;
152 
153  // Vector loop
154  for(; x <= x_end - 8; x += 8)
155  {
156  const uint8x8_t pixels = vld1_u8(input.ptr() + x);
157 
158  ++local_hist[vget_lane_u8(pixels, 0)];
159  ++local_hist[vget_lane_u8(pixels, 1)];
160  ++local_hist[vget_lane_u8(pixels, 2)];
161  ++local_hist[vget_lane_u8(pixels, 3)];
162  ++local_hist[vget_lane_u8(pixels, 4)];
163  ++local_hist[vget_lane_u8(pixels, 5)];
164  ++local_hist[vget_lane_u8(pixels, 6)];
165  ++local_hist[vget_lane_u8(pixels, 7)];
166  }
167 
168  // Process leftover pixels
169  for(; x < x_end; ++x)
170  {
171  ++local_hist[input.ptr()[x]];
172  }
173  },
174  input);
175 
176  // Merge histograms
177  merge_histogram(_output->buffer(), local_hist.data(), _max_range_size);
178 }
179 
180 void NEHistogramKernel::calculate_window_lut() const
181 {
182  const int32_t offset = _output->offset();
183  const size_t bins = _output->num_bins();
184  const uint32_t range = _output->range();
185 
186  std::fill_n(_window_lut, offset, 0);
187 
188  for(unsigned int p = offset; p < _max_range_size; ++p)
189  {
190  _window_lut[p] = ((p - offset) * bins) / range;
191  }
192 }
193 
194 void NEHistogramKernel::configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut)
195 {
198  ARM_COMPUTE_ERROR_ON(nullptr == output);
199  ARM_COMPUTE_ERROR_ON(nullptr == local_hist);
200  ARM_COMPUTE_ERROR_ON(nullptr == window_lut);
201 
202  _input = input;
203  _output = output;
204  _local_hist = local_hist;
205  _window_lut = window_lut;
206 
207  //Check offset
208  ARM_COMPUTE_ERROR_ON_MSG(0 > _output->offset() || _output->offset() > static_cast<int32_t>(_max_range_size), "Offset is larger than the image value range.");
209 
210  //Check range
211  ARM_COMPUTE_ERROR_ON_MSG(static_cast<int32_t>(_output->range()) > static_cast<int32_t>(_max_range_size) /* max range */, "Range larger than the image value range.");
212 
213  // Calculate LUT
214  calculate_window_lut();
215 
216  // Set appropriate function
217  _func = &NEHistogramKernel::histogram_U8;
218 
219  Window win = calculate_max_window(*input->info(), Steps());
220 
221  INEKernel::configure(win);
222 }
223 
225 {
228  ARM_COMPUTE_ERROR_ON(nullptr == output);
229 
230  _input = input;
231  _output = output;
232 
233  // Set appropriate function
234  _func = &NEHistogramKernel::histogram_fixed_U8;
235 
236  Window win = calculate_max_window(*input->info(), Steps());
237 
238  INEKernel::configure(win);
239 }
240 
242 {
245  ARM_COMPUTE_ERROR_ON(_func == nullptr);
246 
247  (this->*_func)(window, info);
248 }
249 } // namespace arm_compute
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:846
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
NEHistogramKernel()
Default constructor.
#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t)
Definition: Validate.h:856
SimpleTensor< float > b
Definition: DFT.cpp:157
1 channel, 1 U8 per channel
uint32_t range() const
Returns the range of the distribution.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:77
Interface for Neon tensor.
Definition: ITensor.h:36
Copyright (c) 2017-2021 Arm Limited.
virtual uint32_t * buffer() const =0
Returns a pointer to the start of the distribution.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
SimpleTensor< T > range(SimpleTensor< T > &dst, float start, const size_t num_of_elements, float step)
Definition: Range.cpp:50
1D Distribution interface
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
Definition: Error.h:456
Coordinates of an item.
Definition: Coordinates.h:37
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
size_t num_bins() const
Returns the number of bins that the distribution has.
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:790
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
Definition: CPPTypes.h:235
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
constexpr int end() const
Return the end of the dimension.
Definition: Window.h:99
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
std::lock_guard< Mutex > lock_guard
Wrapper of lock_guard data-object.
Definition: Mutex.h:37
constexpr int start() const
Return the start of the dimension.
Definition: Window.h:94
void configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut)
Set the input image and the distribution output.
Describe a multidimensional execution window.
Definition: Window.h:39
int32_t offset() const
Returns the offset of the distribution.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
Definition: Window.h:145