Arm Compute Library 22.08 — source listing of pooling_depthfirst_cache_oblivious.hpp.
/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24 #pragma once
25 
#include "pool_common.hpp"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <stack>
#include <vector>
30 
31 namespace arm_conv {
32 namespace pooling {
33 
34 template <class strategy>
35 class PoolingDepthfirstCacheOblivious : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type>
36 {
37  using TInput = typename strategy::operand_type;
38  using TOutput = typename strategy::return_type;
39 
40  const PoolingArgs m_args; // Copy of arguments
41 
42  constexpr static unsigned int input_rows(void)
43  {
44  return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
45  }
46 
47  constexpr static unsigned int input_cols(void)
48  {
49  return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
50  }
51 
52  size_t sizeof_input_buffer(void) const
53  {
54  return sizeof(TInput) * m_args.n_channels;
55  }
56 
57  size_t sizeof_output_buffer(void) const
58  {
59  return sizeof(TOutput) * m_args.n_channels;
60  }
61 
62  public:
63  PoolingDepthfirstCacheOblivious(const PoolingArgs &args) : m_args(args)
64  {
65  }
66 
69 
70  size_t get_working_size(void) const override
71  {
72  // We require an array of pointers for the inputs and outputs, a
73  // channel-length vector in which to dump surplus output, and a
74  // channel-length vector of padding values.
75  return sizeof_input_buffer() + sizeof_output_buffer();
76  }
77 
78  void execute(
79  const void *const input,
80  void *const output,
81  void *const working_space
82  ) const override
83  {
84  const size_t ld_input_col = m_args.n_channels;
85  const size_t ld_input_row = ld_input_col * m_args.input_cols;
86  const size_t ld_input_batch = ld_input_row * m_args.input_rows;
87  const size_t ld_output_col = ld_input_col;
88  const size_t ld_output_row = ld_output_col * m_args.output_cols;
89  const size_t ld_output_batch = ld_output_row * m_args.output_rows;
90 
91  execute(
92  input, ld_input_col, ld_input_row, ld_input_batch,
93  output, ld_output_col, ld_output_row, ld_output_batch,
94  working_space
95  );
96  }
97 
98  void execute(
99  const void *const input,
100  size_t ld_input_col,
101  size_t ld_input_row,
102  size_t ld_input_batch,
103  void *const output,
104  size_t ld_output_col,
105  size_t ld_output_row,
106  size_t ld_output_batch,
107  void *const working_space
108  ) const override
109  {
110  execute(
111  m_args.n_batches, m_args.input_rows, m_args.input_cols,
112  m_args.n_channels,
113  input, ld_input_col, ld_input_row, ld_input_batch,
114  m_args.padding,
115  m_args.output_rows, m_args.output_cols,
116  output, ld_output_col, ld_output_row, ld_output_batch,
117  working_space
118  );
119  }
120 
121  void execute(
122  unsigned int batches,
123  unsigned int input_height,
124  unsigned int input_width,
125  unsigned int channels,
126  const void *const _input,
127  size_t ld_input_col,
128  size_t ld_input_row,
129  size_t ld_input_batch,
130  const PaddingValues &padding,
131  unsigned int output_height,
132  unsigned int output_width,
133  void *const _output,
134  size_t ld_output_col,
135  size_t ld_output_row,
136  size_t ld_output_batch,
137  void *const _working_space
138  ) const override
139  {
140  strategy strat(m_args.cpu_info);
141 #ifdef CYCLE_PROFILING
142  arm_gemm::profiler prof;
143 #endif // CYCLE_PROFILING
144 
145  // Cast input and output pointers into the right types
146  const TInput *const inptr = static_cast<const TInput *>(_input);
147  TOutput *const outptr = static_cast<TOutput *>(_output);
148 
149  // Allocate portions of the working space
150  uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
151  TOutput *const output_buffer = reinterpret_cast<TOutput *>(working_space);
152  TInput *const input_buffer = reinterpret_cast<TInput *>(working_space + sizeof_output_buffer());
153 
154  // Fill the input buffer
155  const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE)
156  ? static_cast<TInput>(0)
157  : (std::numeric_limits<TInput>::has_infinity
158  ? -std::numeric_limits<TInput>::infinity()
160  for (unsigned int i = 0; i < channels; i++)
161  {
162  input_buffer[i] = pad_value;
163  }
164 
165  // Keep subdividing the output plane across the longest dimension until we
166  // reach the size of the tile. Queue items for later processing. Note - we
167  // can determine the largest size of the queue a priori from the input
168  // tensor size, this would allow us to allocate memory within the working
169  // space and improve performance.
170  struct WorkItem
171  {
172  unsigned int output_i, output_j;
173  unsigned int output_height, output_width;
174 
175  WorkItem(unsigned int i, unsigned int j, unsigned int height, unsigned int width)
176  : output_i(i), output_j(j), output_height(height), output_width(width) {}
177  };
178 
179  auto execute = [&] (const WorkItem &item) {
180  // Create an array for the output pointers
181  TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
182  TOutput **const outptr_array = _outptr_array;
183 
184  // Construct the output pointer array
185  {
186  const auto output_pad_right = strategy::out_rows() - item.output_width;
187  auto outptr_element = outptr_array;
188  auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col;
189 
190  // Fill the array with pointers to the output buffer
191  for (unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++)
192  {
193  outptr_array[i] = output_buffer;
194  }
195 
196  // Fill in the valid portion of the array
197  for (unsigned int i = 0; i < item.output_height; i++)
198  {
199  auto outptr_col = outptr_row;
200  for (unsigned int j = 0; j < item.output_width; j++)
201  {
202  *(outptr_element++) = outptr_col;
203  outptr_col += ld_output_col;
204  }
205  outptr_element += output_pad_right;
206  outptr_row += ld_output_row;
207  }
208  }
209 
210  const int start_i = item.output_i * strategy::stride_rows() - padding.top;
211  const int end_i = start_i + input_rows();
212  const unsigned int pad_top = std::max(0, 0 - start_i);
213  const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height));
214 
215  const int start_j = item.output_j * strategy::stride_cols() - padding.left;
216  const int end_j = start_j + input_cols();
217  const unsigned int pad_left = std::max(0, 0 - start_j);
218  const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width));
219 
220  // Create an array for the input pointers
221  const TInput * _inptr_array[input_rows() * input_cols()];
222  const TInput **const inptr_array = _inptr_array;
223  {
224  const unsigned int row_padding = pad_top + pad_bottom;
225  const unsigned int valid_rows = input_rows() - row_padding;
226 
227  const unsigned int col_padding = pad_left + pad_right;
228  const unsigned int valid_cols = input_cols() - col_padding;
229 
230  // Fill the array with pointers to the input buffer
231  for (unsigned int i = 0; i < input_rows() * input_cols(); i++)
232  {
233  inptr_array[i] = input_buffer;
234  }
235 
236  // Compute valid initial pointer
237  auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col;
238 
239  // Fill in the valid portion of the input array
240  auto inptr_element = inptr_array + pad_top * input_cols() + pad_left;
241  for (unsigned int i = 0; i < valid_rows; i++)
242  {
243  auto inptr_col = inptr_row;
244  for (unsigned int j = 0; j < valid_cols; j++)
245  {
246  *(inptr_element++) = inptr_col;
247  inptr_col += ld_input_col;
248  }
249 
250  inptr_row += ld_input_row;
251  inptr_element += col_padding; // Skip the padding elements
252  }
253  }
254 
255  // Call the kernel
256 #ifdef CYCLE_PROFILING
257  // TODO Work number
258  auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols()));
259 #endif // CYCLE_PROFILING
260  strat.kernel(channels, inptr_array, outptr_array,
261  pad_left, pad_top, pad_right, pad_bottom);
262  };
263 
264  // Add the initial work item to the stack of work.
265  std::stack<WorkItem, std::vector<WorkItem>> stack;
266  stack.push(WorkItem(0, 0, output_height, output_width));
267  while (!stack.empty())
268  {
269  // Pop an item from the stack, bisect the largest dimension and either
270  // execute the resulting tiles or add them to the stack if they are too
271  // large.
272  const WorkItem item(stack.top());
273  stack.pop();
274 
275  if (item.output_height <= strategy::out_rows() &&
276  item.output_width <= strategy::out_cols())
277  {
278  execute(item);
279  }
280  else
281  {
282  // Split the largest dimension, such that we get an exact number of
283  // tiles in the first partition.
284  if (item.output_height >= item.output_width)
285  {
286  const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows();
287  const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2;
288 
289  const unsigned int height_first = tiles_first * strategy::out_rows();
290  const unsigned int height_second = item.output_height - height_first;
291 
292  stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width));
293  stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width));
294  }
295  else
296  {
297  const unsigned int width_in_tiles = item.output_width / strategy::out_cols();
298  const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2;
299 
300  const unsigned int width_first = tiles_first * strategy::out_cols();
301  const unsigned int width_second = item.output_width - width_first;
302 
303  stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second));
304  stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first));
305  }
306  }
307  }
308  }
309 };
310 
311 } // namespace pooling
312 } // namespace arm_conv
PoolingDepthfirstCacheOblivious & operator=(PoolingDepthfirstCacheOblivious &)=delete
T * input_buffer
T * output_buffer
const size_t input_height
Definition: impl.cpp:61
const size_t input_width
Definition: impl.cpp:62
template UniquePoolingCommon< float, float > pooling(const PoolingArgs &, const Nothing &)
void execute(unsigned int batches, unsigned int input_height, unsigned int input_width, unsigned int channels, const void *const _input, size_t ld_input_col, size_t ld_input_row, size_t ld_input_batch, const PaddingValues &padding, unsigned int output_height, unsigned int output_width, void *const _output, size_t ld_output_col, size_t ld_output_row, size_t ld_output_batch, void *const _working_space) const override
const StratType * strategy
void execute(const void *const input, size_t ld_input_col, size_t ld_input_row, size_t ld_input_batch, void *const output, size_t ld_output_col, size_t ld_output_row, size_t ld_output_batch, void *const working_space) const override
unsigned int batches
void execute(const void *const input, void *const output, void *const working_space) const override
T ** outptr_array