Compute Library
 22.05
pooling_depthfirst.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #pragma once
26 
27 #include "depthfirst_driver.hpp"
29 #include "utils.hpp"
30 
31 #include <alloca.h>
32 #include <limits>
33 
34 namespace arm_conv {
35 namespace pooling {
36 
37 template <typename TInput, typename TOutput>
39 {
40  unsigned int input_rows, input_cols, output_rows, output_cols;
41 
42  public:
43  DepthfirstStrategy(unsigned int window_rows, unsigned int window_cols,
44  unsigned int stride_rows, unsigned int stride_cols,
45  unsigned int output_rows, unsigned int output_cols)
46  : input_rows(output_rows + (window_rows - 1) * stride_rows),
47  input_cols(output_cols + (window_cols - 1) * stride_cols),
48  output_rows(output_rows), output_cols(output_cols)
49  {
50  }
51 
52  unsigned int get_input_rows() const override { return input_rows; }
53  unsigned int get_input_cols() const override { return input_cols; }
54  unsigned int get_output_rows() const override { return output_rows; }
55  unsigned int get_output_cols() const override { return output_cols; }
56 
57  typedef void (*KernelType)(
58  unsigned int n_channels,
59  const TInput *const *,
60  TOutput *const *,
61  bool exclude_padding,
62  unsigned int pad_left,
63  unsigned int pad_top,
64  unsigned int pad_right,
65  unsigned int pad_bottom
66  );
67  virtual KernelType get_kernel(void) const = 0;
68 };
69 
70 
/* Header stored at the start of each thread's working space.
 *
 * Both pointers are set by `initialise_working_space`:
 *  - `input_buffer` points directly after this header and holds one
 *    padding value per channel;
 *  - `output_buffer` points directly after those input values.
 */
struct WorkingSpace
{
  void *input_buffer;
  void *output_buffer;
};
76 
77 
78 template <typename TInput, typename TOutput=TInput, class OutputStage=Nothing>
79 class PoolingDepthfirst : public DepthfirstDriver<TInput, TOutput>
80 {
81  size_t sizeof_input_buffer(void) const
82  {
83  return sizeof(TInput) * this->m_args.n_channels;
84  }
85 
86  size_t sizeof_output_buffer(void) const
87  {
88  return sizeof(TOutput) * this->m_args.n_channels;
89  }
90 
91  protected:
92  /* Compute the amount of working space required for a single thread. */
93  size_t get_working_size_per_thread(unsigned int n_channels) const override
94  {
95  return sizeof(WorkingSpace) + n_channels * (sizeof(TInput) + sizeof(TOutput));
96  }
97 
98  /* Initialise the working space for a thread. */
99  void initialise_working_space(void *raw_ws, unsigned int n_channels) const override
100  {
101  auto ws = reinterpret_cast<WorkingSpace *>(raw_ws);
102  ws->input_buffer = ws + 1;
103  ws->output_buffer = reinterpret_cast<TInput *>(ws + 1) + n_channels;
104 
105  // Fill the input buffer with an appropriate value
106  TInput fill_val = 0;
107  if (this->m_args.pool_type == PoolingType::MAX)
108  {
109  using limits = std::numeric_limits<TInput>;
110  if (limits::has_infinity)
111  {
112  fill_val = -limits::infinity();
113  }
114  else
115  {
116  fill_val = limits::min();
117  }
118  }
119 
120  auto ptr = reinterpret_cast<TInput *>(ws->input_buffer);
121  for (; n_channels; n_channels--)
122  {
123  *(ptr++) = fill_val;
124  }
125  }
126 
127  /* Compute a portion of the output tensor with padding. */
128  void compute_tile_padded(
129  unsigned int output_i, unsigned int output_j,
130  unsigned int channel_start, unsigned int channel_end,
132  const TensorSpec<TOutput *> &output,
133  void *working_space
134  ) const override
135  {
136  const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
137  this->m_strat.get())->get_kernel();
138 
139  // Get the working space, and some space on the stack for pointer arrays
140  auto ws = reinterpret_cast<WorkingSpace *>(working_space);
141  auto inptr_array = reinterpret_cast<const TInput **>(alloca(
142  sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
143  auto outptr_array = reinterpret_cast<TOutput **>(alloca(
144  sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
145 
146  // Prepare the input pointers
147  const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
148  const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
149  const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
150 
151  const unsigned int end_ii = ii + this->m_strat->get_input_rows();
152  const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
153 
154  const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
155  const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
156  const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
157 
158  const unsigned int end_ij = ij + this->m_strat->get_input_cols();
159  const auto input_pad_right = end_ij < this->m_args.input_cols ? 0 : end_ij - this->m_args.input_cols;
160 
161  fill_pointer_array<const TInput>(
162  inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
163  input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
164  input.ld_row, input.ld_col,
165  reinterpret_cast<const TInput *>(ws->input_buffer),
166  input_pad_top, this->m_args.input_rows - input_i,
167  input_pad_left, this->m_args.input_cols - input_j
168  );
169 
170  // Prepare the output pointers
172  outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
173  output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
174  output.ld_row, output.ld_col,
175  reinterpret_cast<TOutput *>(ws->output_buffer),
176  0, this->m_args.output_rows - output_i, // Top padding, # valid rows
177  0, this->m_args.output_cols - output_j // Left padding, # valid columns
178  );
179 
180  // Call the kernel
181  kern(
182  channel_end - channel_start, inptr_array, outptr_array,
183  this->m_args.exclude_padding,
184  input_pad_left, input_pad_top,
185  input_pad_right, input_pad_bottom
186  );
187  }
188 
189  // Compute a portion of the work with only top/bottom padding.
190  void compute_row_padded_tile_row(
191  const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
192  const unsigned int channel_start, const unsigned int channel_end,
193  const TensorSpec<const TInput *> &input,
194  const TensorSpec<TOutput *> &output,
195  void *working_space
196  ) const override
197  {
198  const auto kern = reinterpret_cast<const DepthfirstStrategy<TInput, TOutput> *>(
199  this->m_strat.get())->get_kernel();
200 
201  // Get the working space, and some space on the stack for pointer arrays
202  auto ws = reinterpret_cast<WorkingSpace *>(working_space);
203  auto inptr_array = reinterpret_cast<const TInput **>(alloca(
204  sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
205  auto outptr_array = reinterpret_cast<TOutput **>(alloca(
206  sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
207 
208  // Prepare the initial input pointers
209  const int ii = static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
210  const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
211  const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
212 
213  const unsigned int end_ii = ii + this->m_strat->get_input_rows();
214  const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
215 
216  const int ij = static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
217  const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
218 
219  const auto end_oi = output_i + this->m_strat->get_output_cols();
220  const auto output_pad_bottom = end_oi < this->m_args.output_rows ? 0 : end_oi - this->m_args.output_rows;
221 
222  fill_pointer_array<const TInput>(
223  inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
224  input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start,
225  input.ld_row, input.ld_col,
226  reinterpret_cast<const TInput *>(ws->input_buffer),
227  input_pad_top, this->m_args.input_rows - input_i,
228  0, this->m_args.input_cols - input_j
229  );
230 
231  // Prepare the initial output pointers
233  outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
234  output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
235  output.ld_row, output.ld_col,
236  reinterpret_cast<TOutput *>(ws->output_buffer),
237  0, this->m_args.output_rows - output_i, // Top padding, # valid rows
238  0, this->m_args.output_cols - output_j // Left padding, # valid columns
239  );
240 
241  // Call the kernel
242  for (; n_tile_cols; n_tile_cols--)
243  {
244  kern(
245  channel_end - channel_start, inptr_array, outptr_array,
246  this->m_args.exclude_padding,
247  0, input_pad_top,
248  0, input_pad_bottom
249  );
250 
251  // Progress the input and output pointer arrays
252  const auto input_col_stride = input.ld_col * this->m_strat->get_output_cols() * this->m_args.pool_stride.cols;
253  for (
254  auto n = input_pad_top * this->m_strat->get_input_cols();
255  n < (this->m_strat->get_input_rows() - input_pad_bottom) * this->m_strat->get_input_cols();
256  n++
257  )
258  {
259  inptr_array[n] += input_col_stride;
260  }
261 
262  const auto output_col_stride = output.ld_col * this->m_strat->get_output_cols();
263  for (
264  auto n = 0u;
265  n < (this->m_strat->get_output_rows() - output_pad_bottom) * this->m_strat->get_output_cols();
266  n++
267  )
268  {
269  outptr_array[n] += output_col_stride;
270  }
271  }
272  }
273 
274  public:
276  const PoolingArgs &args, const OutputStage &os = {})
277  : DepthfirstDriver<TInput, TOutput>(strat, args)
278  {
279  ARM_COMPUTE_UNUSED(os);
280  }
281 };
282 
283 } // namespace pooling
284 } // namespace arm_conv
#define MAX(x, y)
unsigned int get_input_cols() const override
template UniquePoolingCommon< float, float > pooling(const PoolingArgs &, const Nothing &)
#define ARM_COMPUTE_UNUSED(...)
Avoids unused-variable warnings.
Definition: Error.h:152
DepthfirstStrategy(unsigned int window_rows, unsigned int window_cols, unsigned int stride_rows, unsigned int stride_cols, unsigned int output_rows, unsigned int output_cols)
unsigned int get_output_rows() const override
unsigned int get_output_cols() const override
unsigned int get_input_rows() const override
void fill_pointer_array(size_t element_size, void **dest_raw, const unsigned int array_rows, const unsigned int array_cols, void *base_ptr_raw, size_t ld_row, size_t ld_col, void *pad_buffer_raw, const unsigned int pad_top, const unsigned int valid_rows, const unsigned int pad_left, const unsigned int valid_cols)
Definition: addressing.cpp:32
virtual KernelType get_kernel(void) const =0
PoolingDepthfirst(const DepthfirstStrategy< TInput, TOutput > *strat, const PoolingArgs &args, const OutputStage &os={})
void(* KernelType)(unsigned int n_channels, const TInput *const *, TOutput *const *, bool exclude_padding, unsigned int pad_left, unsigned int pad_top, unsigned int pad_right, unsigned int pad_bottom)
T ** outptr_array