Compute Library
 21.05
CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2018-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
28 
29 #include <algorithm>
30 #include <cmath>
31 
32 namespace arm_compute
33 {
34 namespace
35 {
36 template <typename T>
37 std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &scores_in, std::vector<int> inds, const BoxNMSLimitInfo &info, int class_id)
38 {
39  std::vector<int> keep;
40  const int proposals_width = proposals->info()->dimension(1);
41 
42  std::vector<T> x1(proposals_width);
43  std::vector<T> y1(proposals_width);
44  std::vector<T> x2(proposals_width);
45  std::vector<T> y2(proposals_width);
46  std::vector<T> areas(proposals_width);
47 
48  for(int i = 0; i < proposals_width; ++i)
49  {
50  x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
51  y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
52  x2[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 2, i)));
53  y2[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 3, i)));
54  areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
55  }
56 
57  // Note: Soft NMS scores have already been initialized with input scores
58 
59  while(!inds.empty())
60  {
61  // Find proposal with max score among remaining proposals
62  int max_pos = 0;
63  for(unsigned int i = 1; i < inds.size(); ++i)
64  {
65  if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
66  {
67  max_pos = i;
68  }
69  }
70  int element = inds.at(max_pos);
71  keep.push_back(element);
72  std::swap(inds.at(0), inds.at(max_pos));
73 
74  // Remove first element and compute IoU of the remaining boxes with identified max box
75  inds.erase(inds.begin());
76 
77  std::vector<int> sorted_indices_temp;
78  for(auto idx : inds)
79  {
80  const auto xx1 = std::max(x1[idx], x1[element]);
81  const auto yy1 = std::max(y1[idx], y1[element]);
82  const auto xx2 = std::min(x2[idx], x2[element]);
83  const auto yy2 = std::min(y2[idx], y2[element]);
84 
85  const auto w = std::max((xx2 - xx1 + 1.f), 0.f);
86  const auto h = std::max((yy2 - yy1 + 1.f), 0.f);
87  const auto inter = w * h;
88  const auto ovr = inter / (areas[element] + areas[idx] - inter);
89 
90  // Update scores based on computed IoU, overlap threshold and NMS method
91  T weight;
92  switch(info.soft_nms_method())
93  {
94  case NMSType::LINEAR:
95  weight = (ovr > info.nms()) ? (1.f - ovr) : 1.f;
96  break;
97  case NMSType::GAUSSIAN: // Gaussian
98  weight = std::exp(-1.f * ovr * ovr / info.soft_nms_sigma());
99  break;
100  case NMSType::ORIGINAL: // Original NMS
101  weight = (ovr > info.nms()) ? 0.f : 1.f;
102  break;
103  default:
104  ARM_COMPUTE_ERROR("Not supported");
105  }
106 
107  // Discard boxes with new scores below min threshold and update pending indices
108  scores_in[class_id][idx] *= weight;
109  if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
110  {
111  sorted_indices_temp.push_back(idx);
112  }
113  }
114  inds = sorted_indices_temp;
115  }
116 
117  return keep;
118 }
119 
120 template <typename T>
121 std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> sorted_indices, const BoxNMSLimitInfo &info, int class_id)
122 {
123  std::vector<int> keep;
124 
125  const int proposals_width = proposals->info()->dimension(1);
126 
127  std::vector<T> x1(proposals_width);
128  std::vector<T> y1(proposals_width);
129  std::vector<T> x2(proposals_width);
130  std::vector<T> y2(proposals_width);
131  std::vector<T> areas(proposals_width);
132 
133  for(int i = 0; i < proposals_width; ++i)
134  {
135  x1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
136  y1[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
137  x2[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 2, i)));
138  y2[i] = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 3, i)));
139  areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
140  }
141 
142  while(!sorted_indices.empty())
143  {
144  int i = sorted_indices.at(0);
145  keep.push_back(i);
146 
147  std::vector<int> sorted_indices_temp = sorted_indices;
148  std::vector<int> new_indices;
149  sorted_indices_temp.erase(sorted_indices_temp.begin());
150 
151  for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
152  {
153  const float xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]);
154  const float yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]);
155  const float xx2 = std::min(x2[sorted_indices_temp.at(j)], x2[i]);
156  const float yy2 = std::min(y2[sorted_indices_temp.at(j)], y2[i]);
157 
158  const float w = std::max((xx2 - xx1 + 1.f), 0.f);
159  const float h = std::max((yy2 - yy1 + 1.f), 0.f);
160  const float inter = w * h;
161  const float ovr = inter / (areas[i] + areas[sorted_indices_temp.at(j)] - inter);
162  const float ctr_x = xx1 + (w / 2);
163  const float ctr_y = yy1 + (h / 2);
164 
165  // If suppress_size is specified, filter the boxes based on their size and position
166  const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && ctr_x < info.im_width() && ctr_y < info.im_height());
167  if(ovr <= info.nms() && keep_size)
168  {
169  new_indices.push_back(j);
170  }
171  }
172 
173  const unsigned int new_indices_size = new_indices.size();
174  std::vector<int> new_sorted_indices(new_indices_size);
175  for(unsigned int i = 0; i < new_indices_size; ++i)
176  {
177  new_sorted_indices[i] = sorted_indices[new_indices[i] + 1];
178  }
179  sorted_indices = new_sorted_indices;
180  }
181 
182  return keep;
183 }
184 } // namespace
185 
187  : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr),
188  _info()
189 {
190 }
191 
193 {
194  return false;
195 }
196 
197 template <typename T>
199 {
200  const int batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
201  const int num_classes = _scores_in->info()->dimension(0);
202  const int scores_count = _scores_in->info()->dimension(1);
203  std::vector<int> total_keep_per_batch(batch_size);
204  std::vector<std::vector<int>> keeps(num_classes);
205  int total_keep_count = 0;
206 
207  std::vector<std::vector<T>> in_scores(num_classes, std::vector<T>(scores_count));
208  for(int i = 0; i < scores_count; ++i)
209  {
210  for(int j = 0; j < num_classes; ++j)
211  {
212  in_scores[j][i] = *reinterpret_cast<const T *>(_scores_in->ptr_to_element(Coordinates(j, i)));
213  }
214  }
215 
216  int offset = 0;
217  int cur_start_idx = 0;
218  for(int b = 0; b < batch_size; ++b)
219  {
220  const int num_boxes = _batch_splits_in == nullptr ? 1 : static_cast<int>(*reinterpret_cast<T *>(_batch_splits_in->ptr_to_element(Coordinates(b))));
221  // Skip first class if there is more than 1 except if the number of classes is 1.
222  const int j_start = (num_classes == 1 ? 0 : 1);
223  for(int j = j_start; j < num_classes; ++j)
224  {
225  std::vector<T> cur_scores(scores_count);
226  std::vector<int> inds;
227  for(int i = 0; i < scores_count; ++i)
228  {
229  const T score = in_scores[j][i];
230  cur_scores[i] = score;
231 
232  if(score > _info.score_thresh())
233  {
234  inds.push_back(i);
235  }
236  }
237  if(_info.soft_nms_enabled())
238  {
239  keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j);
240  }
241  else
242  {
243  std::sort(inds.data(), inds.data() + inds.size(),
244  [&cur_scores](int lhs, int rhs)
245  {
246  return cur_scores[lhs] > cur_scores[rhs];
247  });
248 
249  keeps[j] = NonMaximaSuppression<T>(_boxes_in, inds, _info, j);
250  }
251  total_keep_count += keeps[j].size();
252  }
253 
254  if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
255  {
256  // merge all scores (represented by indices) together and sort
257  auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]()
258  {
259  std::vector<T> ret(total_keep_count);
260 
261  int ret_idx = 0;
262  for(unsigned int i = 1; i < keeps.size(); ++i)
263  {
264  auto &cur_keep = keeps[i];
265  for(auto &ckv : cur_keep)
266  {
267  ret[ret_idx++] = in_scores[i][ckv];
268  }
269  }
270 
271  std::sort(ret.data(), ret.data() + ret.size());
272 
273  return ret;
274  };
275 
276  auto all_scores_sorted = get_all_scores_sorted();
277  const T image_thresh = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()];
278  for(int j = 1; j < num_classes; ++j)
279  {
280  auto &cur_keep = keeps[j];
281  std::vector<int> new_keeps_j;
282  for(auto &k : cur_keep)
283  {
284  if(in_scores[j][k] >= image_thresh)
285  {
286  new_keeps_j.push_back(k);
287  }
288  }
289  keeps[j] = new_keeps_j;
290  }
291  total_keep_count = _info.detections_per_im();
292  }
293 
294  total_keep_per_batch[b] = total_keep_count;
295 
296  // Write results
297  int cur_out_idx = 0;
298  for(int j = j_start; j < num_classes; ++j)
299  {
300  auto &cur_keep = keeps[j];
301  auto cur_out_scores = reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
302  auto cur_out_classes = reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
303  const int box_column = (cur_start_idx + cur_out_idx) * 4;
304 
305  for(unsigned int k = 0; k < cur_keep.size(); ++k)
306  {
307  cur_out_scores[k] = in_scores[j][cur_keep[k]];
308  cur_out_classes[k] = static_cast<T>(j);
309  auto cur_out_box_row0 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
310  auto cur_out_box_row1 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
311  auto cur_out_box_row2 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
312  auto cur_out_box_row3 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
313  *cur_out_box_row0 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
314  *cur_out_box_row1 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
315  *cur_out_box_row2 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
316  *cur_out_box_row3 = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
317  }
318 
319  cur_out_idx += cur_keep.size();
320  }
321 
322  if(_keeps != nullptr)
323  {
324  cur_out_idx = 0;
325  for(int j = 0; j < num_classes; ++j)
326  {
327  for(unsigned int i = 0; i < keeps[j].size(); ++i)
328  {
329  *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast<T>(keeps[j].at(i));
330  }
331  *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = keeps[j].size();
332  cur_out_idx += keeps[j].size();
333  }
334  }
335 
336  offset += num_boxes;
337  cur_start_idx += total_keep_count;
338  }
339 
340  if(_batch_splits_out != nullptr)
341  {
342  for(int b = 0; b < batch_size; ++b)
343  {
344  *reinterpret_cast<float *>(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b];
345  }
346  }
347 }
348 
349 void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
350  ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
351 {
352  ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
354  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, boxes_in, scores_out);
355  const unsigned int num_classes = scores_in->info()->dimension(0);
356 
357  ARM_COMPUTE_UNUSED(num_classes);
358  ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes");
359  ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows");
360 
361  ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1));
362  ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4);
363  ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != classes->info()->dimension(0));
364  if(keeps != nullptr)
365  {
366  ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output");
369  ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0));
370  ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0));
371  }
372  if(batch_splits_in != nullptr)
373  {
374  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in);
375  }
376  if(batch_splits_out != nullptr)
377  {
378  ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out);
379  }
380 
381  _scores_in = scores_in;
382  _boxes_in = boxes_in;
383  _batch_splits_in = batch_splits_in;
384  _scores_out = scores_out;
385  _boxes_out = boxes_out;
386  _classes = classes;
387  _batch_splits_out = batch_splits_out;
388  _keeps = keeps;
389  _keeps_size = keeps_size;
390  _info = info;
391 
392  // Configure kernel window
393  Window win = calculate_max_window(*scores_in->info(), Steps(scores_in->info()->dimension(0)));
394 
395  IKernel::configure(win);
396 }
397 
399 {
404 
405  switch(_scores_in->info()->data_type())
406  {
407  case DataType::F32:
408  run_nmslimit<float>();
409  break;
410  case DataType::F16:
411  run_nmslimit<half>();
412  break;
413  default:
414  ARM_COMPUTE_ERROR("Not supported");
415  }
416 }
417 } // namespace arm_compute
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:861
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
SimpleTensor< float > w
Definition: DFT.cpp:156
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
Definition: ITensor.h:63
void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes, ITensor *batch_splits_out=nullptr, ITensor *keeps=nullptr, ITensor *keeps_size=nullptr, const BoxNMSLimitInfo info=BoxNMSLimitInfo())
Initialise the kernel's input and output tensors.
float score_thresh() const
Get the score threshold.
Definition: Types.h:580
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
SimpleTensor< float > b
Definition: DFT.cpp:157
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
BoxWithNonMaximaSuppressionLimit Information class.
Definition: Types.h:554
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
bool is_parallelisable() const override
Indicates whether or not the kernel is parallelisable.
Interface for CPU tensor.
Definition: ITensor.h:36
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(f, w)
Definition: Validate.h:179
1 channel, 1 U32 per channel
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:539
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
Definition: Error.h:456
Coordinates of an item.
Definition: Coordinates.h:37
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:915
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:786
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
Definition: CPPTypes.h:252
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157
void swap(Window &lhs, Window &rhs)
Definition: Window.inl:304
Describe a multidimensional execution window.
Definition: Window.h:39
bool soft_nms_enabled() const
Check if soft NMS is enabled.
Definition: Types.h:595
int detections_per_im() const
Get the number of detections.
Definition: Types.h:590