Compute Library
 22.11
pooling_depthfirst.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #pragma once
26 
27 #include "depthfirst_driver.hpp"
29 #include "utils.hpp"
30 #if !defined(_WIN64) && !defined(__OpenBSD__)
31 #include <alloca.h>
32 #endif /* !defined(_WIN64) && !defined(__OpenBSD__) */
33 #include <limits>
34 
35 namespace arm_conv {
36 namespace pooling {
37 
38 template <typename TInput, typename TOutput>
40 {
41  unsigned int input_rows, input_cols, output_rows, output_cols;
42 
43  public:
44  DepthfirstStrategy(unsigned int window_rows, unsigned int window_cols,
45  unsigned int stride_rows, unsigned int stride_cols,
46  unsigned int output_rows, unsigned int output_cols)
47  : input_rows(output_rows + (window_rows - 1) * stride_rows),
48  input_cols(output_cols + (window_cols - 1) * stride_cols),
49  output_rows(output_rows), output_cols(output_cols)
50  {
51  }
52 
53  unsigned int get_input_rows() const override { return input_rows; }
54  unsigned int get_input_cols() const override { return input_cols; }
55  unsigned int get_output_rows() const override { return output_rows; }
56  unsigned int get_output_cols() const override { return output_cols; }
57 
58  typedef void (*KernelType)(
59  unsigned int n_channels,
60  const TInput *const *,
61  TOutput *const *,
62  bool exclude_padding,
63  unsigned int pad_left,
64  unsigned int pad_top,
65  unsigned int pad_right,
66  unsigned int pad_bottom
67  );
68  virtual KernelType get_kernel(void) const = 0;
69 };
70 
71 
/* Header placed at the start of each thread's working space.
 *
 * The listing had lost the struct's declaring line and its second member;
 * both are restored here from their uses in PoolingDepthfirst, which takes
 * sizeof(WorkingSpace) and reads and writes both pointers.
 */
struct WorkingSpace
{
  void *input_buffer;   // Padding source handed to fill_pointer_array for inputs
  void *output_buffer;  // Scratch destination handed to fill_pointer_array for outputs
};
77 
78 
79 template <typename TInput, typename TOutput=TInput, class OutputStage=Nothing>
80 class PoolingDepthfirst : public DepthfirstDriver<TInput, TOutput>
81 {
82  size_t sizeof_input_buffer(void) const
83  {
84  return sizeof(TInput) * this->m_args.n_channels;
85  }
86 
87  size_t sizeof_output_buffer(void) const
88  {
89  return sizeof(TOutput) * this->m_args.n_channels;
90  }
91 
92  protected:
93  /* Compute the amount of working space required for a single thread. */
94  size_t get_working_size_per_thread(unsigned int n_channels) const override
95  {
96  return sizeof(WorkingSpace) + n_channels * (sizeof(TInput) + sizeof(TOutput));
97  }
98 
99  /* Initialise the working space for a thread. */
100  void initialise_working_space(void *raw_ws, unsigned int n_channels) const override
101  {
102  auto ws = reinterpret_cast<WorkingSpace *>(raw_ws);
103  ws->input_buffer = ws + 1;
104  ws->output_buffer = reinterpret_cast<TInput *>(ws + 1) + n_channels;
105 
106  // Fill the input buffer with an appropriate value
107  TInput fill_val = 0;
108  if (this->m_args.pool_type == PoolingType::MAX)
109  {
110  using limits = std::numeric_limits<TInput>;
111  if (limits::has_infinity)
112  {
113  fill_val = -limits::infinity();
114  }
115  else
116  {
117  fill_val = limits::min();
118  }
119  }
120 
121  auto ptr = reinterpret_cast<TInput *>(ws->input_buffer);
122  for (; n_channels; n_channels--)
123  {
124  *(ptr++) = fill_val;
125  }
126  }
127 
128  /* Compute a portion of the output tensor with padding. */
129  void compute_tile_padded(
130  unsigned int output_i, unsigned int output_j,
131  unsigned int channel_start, unsigned int channel_end,
133  const TensorSpec<TOutput *> &output,
134  void *working_space
135  ) const override
136  {
137  const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
138  this->m_strat.get())->get_kernel();
139 
140  // Get the working space, and some space on the stack for pointer arrays
141  auto ws = reinterpret_cast<WorkingSpace *>(working_space);
142  auto inptr_array = reinterpret_cast<const TInput **>(alloca(
143  sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
144  auto outptr_array = reinterpret_cast<TOutput **>(alloca(
145  sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
146 
147  // Prepare the input pointers
148  const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
149  const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
150  const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
151 
152  const unsigned int end_ii = ii + this->m_strat->get_input_rows();
153  const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
154 
155  const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
156  const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
157  const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
158 
159  const unsigned int end_ij = ij + this->m_strat->get_input_cols();
160  const auto input_pad_right = end_ij < this->m_args.input_cols ? 0 : end_ij - this->m_args.input_cols;
161 
162  fill_pointer_array<const TInput>(
163  inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
164  input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
165  input.ld_row, input.ld_col,
166  reinterpret_cast<const TInput *>(ws->input_buffer),
167  input_pad_top, this->m_args.input_rows - input_i,
168  input_pad_left, this->m_args.input_cols - input_j
169  );
170 
171  // Prepare the output pointers
173  outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
174  output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
175  output.ld_row, output.ld_col,
176  reinterpret_cast<TOutput *>(ws->output_buffer),
177  0, this->m_args.output_rows - output_i, // Top padding, # valid rows
178  0, this->m_args.output_cols - output_j // Left padding, # valid columns
179  );
180 
181  // Call the kernel
182  kern(
183  channel_end - channel_start, inptr_array, outptr_array,
184  this->m_args.exclude_padding,
185  input_pad_left, input_pad_top,
186  input_pad_right, input_pad_bottom
187  );
188  }
189 
190  // Compute a portion of the work with only top/bottom padding.
191  void compute_row_padded_tile_row(
192  const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
193  const unsigned int channel_start, const unsigned int channel_end,
194  const TensorSpec<const TInput *> &input,
195  const TensorSpec<TOutput *> &output,
196  void *working_space
197  ) const override
198  {
199  const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
200  this->m_strat.get())->get_kernel();
201 
202  // Get the working space, and some space on the stack for pointer arrays
203  auto ws = reinterpret_cast<WorkingSpace *>(working_space);
204  auto inptr_array = reinterpret_cast<const TInput **>(alloca(
205  sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
206  auto outptr_array = reinterpret_cast<TOutput **>(alloca(
207  sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
208 
209  // Prepare the initial input pointers
210  const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
211  const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
212  const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
213 
214  const unsigned int end_ii = ii + this->m_strat->get_input_rows();
215  const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
216 
217  const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
218  const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
219 
220  const auto end_oi = output_i + this->m_strat->get_output_cols();
221  const auto output_pad_bottom = end_oi < this->m_args.output_rows ? 0 : end_oi - this->m_args.output_rows;
222 
223  fill_pointer_array<const TInput>(
224  inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
225  input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
226  input.ld_row, input.ld_col,
227  reinterpret_cast<const TInput *>(ws->input_buffer),
228  input_pad_top, this->m_args.input_rows - input_i,
229  0, this->m_args.input_cols - input_j
230  );
231 
232  // Prepare the initial output pointers
234  outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
235  output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
236  output.ld_row, output.ld_col,
237  reinterpret_cast<TOutput *>(ws->output_buffer),
238  0, this->m_args.output_rows - output_i, // Top padding, # valid rows
239  0, this->m_args.output_cols - output_j // Left padding, # valid columns
240  );
241 
242  // Call the kernel
243  for (; n_tile_cols; n_tile_cols--)
244  {
245  kern(
246  channel_end - channel_start, inptr_array, outptr_array,
247  this->m_args.exclude_padding,
248  0, input_pad_top,
249  0, input_pad_bottom
250  );
251 
252  // Progress the input and output pointer arrays
253  const auto input_col_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.pool_stride.cols;
254  for (
255  auto n = input_pad_top * this->m_strat->get_input_cols();
256  n < (this->m_strat->get_input_rows() - input_pad_bottom) * this->m_strat->get_input_cols();
257  n++
258  )
259  {
260  inptr_array[n] += input_col_stride;
261  }
262 
263  const auto output_col_stride = output.ld_col * this->m_strat->get_output_cols();
264  for (
265  auto n = 0u;
266  n < (this->m_strat->get_output_rows() - output_pad_bottom) * this->m_strat->get_output_cols();
267  n++
268  )
269  {
270  outptr_array[n] += output_col_stride;
271  }
272  }
273  }
274 
275  public:
277  const PoolingArgs &args, const OutputStage &os = {})
278  : DepthfirstDriver<TInput, TOutput>(strat, args)
279  {
280  ARM_COMPUTE_UNUSED(os);
281  }
282 };
283 
284 } // namespace pooling
285 } // namespace arm_conv
#define MAX(x, y)
unsigned int get_input_cols() const override
template UniquePoolingCommon< float, float > pooling(const PoolingArgs &, const Nothing &)
#define ARM_COMPUTE_UNUSED(...)
To avoid unused-variable warnings.
Definition: Error.h:152
DepthfirstStrategy(unsigned int window_rows, unsigned int window_cols, unsigned int stride_rows, unsigned int stride_cols, unsigned int output_rows, unsigned int output_cols)
unsigned int get_output_rows() const override
unsigned int get_output_cols() const override
unsigned int get_input_rows() const override
void fill_pointer_array(size_t element_size, void **dest_raw, const unsigned int array_rows, const unsigned int array_cols, void *base_ptr_raw, size_t ld_row, size_t ld_col, void *pad_buffer_raw, const unsigned int pad_top, const unsigned int valid_rows, const unsigned int pad_left, const unsigned int valid_cols)
Definition: addressing.cpp:32
virtual KernelType get_kernel(void) const =0
PoolingDepthfirst(const DepthfirstStrategy< TInput, TOutput > *strat, const PoolingArgs &args, const OutputStage &os={})
void(* KernelType)(unsigned int n_channels, const TInput *const *, TOutput *const *, bool exclude_padding, unsigned int pad_left, unsigned int pad_top, unsigned int pad_right, unsigned int pad_bottom)
T ** outptr_array