Compute Library
 22.08
pooling_depthfirst_generic_quantized.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #pragma once
26 
27 #include "pool_common.hpp"
28 #include "utils.hpp"
29 
30 namespace arm_conv {
31 namespace pooling {
32 
33 template <class strategy>
34 class PoolingDepthfirstGenericQuantized : public PoolingCommon<typename strategy::operand_type, typename strategy::return_type, Requantize32>
35 {
36  using TInput = typename strategy::operand_type;
37  using TOutput = typename strategy::return_type;
38 
39  const PoolingArgs m_args; // Copy of arguments
40  const Requantize32 m_requant; // Quantization parameters
41 
42  unsigned int input_rows(void) const
43  {
44  return m_args.pool_window.rows;
45  }
46 
47  unsigned int input_cols(void) const
48  {
49  return m_args.pool_window.cols;
50  }
51 
52  public:
53  PoolingDepthfirstGenericQuantized(const PoolingArgs &args, const Requantize32 &rq) : m_args(args), m_requant(rq)
54  {
55  }
56 
59 
60  size_t sizeof_input_pointer_array(void) const
61  {
62  return sizeof(TInput *) * input_rows() * input_cols();
63  }
64 
65  size_t get_working_size(unsigned int num_threads) const override
66  {
67  return num_threads * sizeof_input_pointer_array();
68  }
69 
70  void execute(
71  const void *const input,
72  void *const output,
73  void *const working_space,
74  unsigned int thread_id,
75  unsigned int num_threads
76  ) const override
77  {
78  const size_t ld_input_col = m_args.n_channels;
79  const size_t ld_input_row = ld_input_col * m_args.input_cols;
80  const size_t ld_input_batch = ld_input_row * m_args.input_rows;
81  const size_t ld_output_col = ld_input_col;
82  const size_t ld_output_row = ld_output_col * m_args.output_cols;
83  const size_t ld_output_batch = ld_output_row * m_args.output_rows;
84 
85  execute(
86  input, ld_input_col, ld_input_row, ld_input_batch,
87  output, ld_output_col, ld_output_row, ld_output_batch,
88  working_space,
89  thread_id, num_threads
90  );
91  }
92 
93  void execute(
94  const void *const input,
95  size_t ld_input_col,
96  size_t ld_input_row,
97  size_t ld_input_batch,
98  void *const output,
99  size_t ld_output_col,
100  size_t ld_output_row,
101  size_t ld_output_batch,
102  void *const working_space,
103  unsigned int thread_id,
104  unsigned int num_threads
105  ) const override
106  {
107  execute(
108  m_args.n_batches, m_args.input_rows, m_args.input_cols,
109  m_args.n_channels,
110  input, ld_input_col, ld_input_row, ld_input_batch,
111  m_args.padding,
112  m_args.output_rows, m_args.output_cols,
113  output, ld_output_col, ld_output_row, ld_output_batch,
114  working_space,
115  thread_id, num_threads
116  );
117  }
118 
119  void execute(
120  unsigned int batches,
121  unsigned int height,
122  unsigned int width,
123  unsigned int channels,
124  const void *const _input,
125  size_t ld_input_col,
126  size_t ld_input_row,
127  size_t ld_input_batch,
128  const PaddingValues &padding,
129  unsigned int output_height,
130  unsigned int output_width,
131  void *const _output,
132  size_t ld_output_col,
133  size_t ld_output_row,
134  size_t ld_output_batch,
135  void *const _working_space,
136  unsigned int thread_id,
137  unsigned int num_threads
138  ) const override
139  {
140  strategy strat(m_args.cpu_info);
141 #ifdef CYCLE_PROFILING
142  arm_gemm::profiler prof;
143 #endif // CYCLE_PROFILING
144 
145  const unsigned int roundup_output_rows = roundup(output_height, num_threads);
146  const unsigned int rows_per_thread = roundup_output_rows / num_threads;
147  int start_out_height = static_cast<int>(thread_id * rows_per_thread);
148  int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
149 
150  unsigned int start_channel = 0;
151  unsigned int end_channel = channels;
152  if(output_height == 1)
153  {
154  const unsigned int channels_per_thread = roundup(channels, num_threads) / num_threads;
155  start_channel = thread_id * channels_per_thread;
156  end_channel = std::min(start_channel + channels_per_thread, channels);
157 
158  // Reset start and end rows
159  start_out_height = 0;
160  end_out_height = output_height;
161  }
162 
163  if(start_channel >= end_channel)
164  {
165  // Early exit in case of multiple threads parallelising on channels
166  return;
167  }
168 
169  // Cast input and output pointers into the right types
170  const TInput *const inptr = static_cast<const TInput *>(_input) + start_channel;
171  TOutput *const outptr = static_cast<TOutput *>(_output) + start_channel;
172 
173  // Grab the input pointer array
174  uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
175  const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space + thread_id * sizeof_input_pointer_array());
176 
177  // For each output tile, construct the requisite set of pointers and call
178  // into the kernel.
179  for (unsigned int batch = 0; batch < batches; batch++)
180  {
181  // Get batch pointers
182  const auto inptr_batch = inptr + batch * ld_input_batch;
183  const auto outptr_batch = outptr + batch * ld_output_batch;
184 
185  for (int out_i = start_out_height; out_i < end_out_height; out_i++)
186  {
187  const int start_in_i = out_i * m_args.pool_stride.rows - padding.top;
188  const int end_in_i = start_in_i + m_args.pool_window.rows;
189 
190  // Compute top/bottom padding
191  const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
192  const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0));
193 
194  // Compute the number of pooling window rows which are contained in
195  // either the valid region of the input tensor, or the padding.
196  const auto padded_bottom = std::min<unsigned int>(
197  start_in_i + m_args.pool_window.rows, height + padding.bottom
198  );
199  const auto n_total_rows = padded_bottom - start_in_i;
200 
201  for (int out_j = 0, start_in_j = -padding.left;
202  out_j < static_cast<int>(output_width);
203  out_j++, start_in_j += m_args.pool_stride.cols)
204  {
205  const int end_in_j = start_in_j + m_args.pool_window.cols;
206 
207  // Compute left/right padding
208  const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0));
209  const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0));
210 
211  // Compute the number of pooling window columns which are contained
212  // in either the valid region of the input tensor, or the padding.
213  const auto padded_right = std::min<unsigned int>(
214  start_in_j + m_args.pool_window.cols, width + padding.right
215  );
216  const auto n_total_cols = padded_right - start_in_j;
217 
218  // Construct the input pointer array - fill in all valid points
219  // contiguously.
220  const TInput **ptrs = inptr_array;
221  for (auto i = pad_top; i < input_rows() - pad_bottom; i++)
222  {
223  // Can skip over the left padding because we will have either the
224  // same or less than the previous tile.
225  unsigned int j = pad_left;
226  const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
227  for (; j < input_cols() - pad_right; j++)
228  {
229  *(ptrs++) = colptr;
230  colptr += ld_input_col;
231  }
232  }
233 
234  // Compute the number of valid cells
235  const auto valid_rows = input_rows() - pad_top - pad_bottom;
236  const auto valid_cols = input_cols() - pad_left - pad_right;
237  const auto valid_cells = valid_rows * valid_cols;
238  const auto cells_in_range = n_total_rows * n_total_cols;
239  const auto window_cells = m_args.exclude_padding ? valid_cells : cells_in_range;
240 
241  // Get the output pointer for this call
242  TOutput *outptr = outptr_batch + out_i * ld_output_row + out_j * ld_output_col;
243 
244 #ifdef CYCLE_PROFILING
245  // TODO Work number
246  auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long) 0);
247 #endif
248  strat.kernel(window_cells, valid_cells, end_channel - start_channel, inptr_array, outptr, m_requant);
249  }
250  }
251  }
252  }
253 };
254 
255 } // namespace pooling
256 } // namespace arm_conv
T roundup(const T a, const T b)
Definition: utils.hpp:70
size_t get_working_size(unsigned int num_threads) const override
PoolingDepthfirstGenericQuantized(const PoolingArgs &args, const Requantize32 &rq)
template UniquePoolingCommon< float, float > pooling(const PoolingArgs &, const Nothing &)
void execute(const void *const input, size_t ld_input_col, size_t ld_input_row, size_t ld_input_batch, void *const output, size_t ld_output_col, size_t ld_output_row, size_t ld_output_batch, void *const working_space, unsigned int thread_id, unsigned int num_threads) const override
void execute(const void *const input, void *const output, void *const working_space, unsigned int thread_id, unsigned int num_threads) const override
void execute(unsigned int batches, unsigned int height, unsigned int width, unsigned int channels, const void *const _input, size_t ld_input_col, size_t ld_input_row, size_t ld_input_batch, const PaddingValues &padding, unsigned int output_height, unsigned int output_width, void *const _output, size_t ld_output_col, size_t ld_output_row, size_t ld_output_batch, void *const _working_space, unsigned int thread_id, unsigned int num_threads) const override
const StratType * strategy
PoolingDepthfirstGenericQuantized & operator=(PoolingDepthfirstGenericQuantized &)=delete
unsigned int batches