depthfirst_driver.hpp
/*
 * Copyright (c) 2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>

#include "src/core/NEON/kernels/assembly/depthwise.hpp"

namespace arm_conv {
namespace depthwise {

// The default accumulator type for a given input/output type; quantized
// (8-bit) types accumulate into 32-bit integers.
template <typename T> struct DefaultTAccum { using Type = T; };
template <> struct DefaultTAccum<int8_t> { using Type = int32_t; };
template <> struct DefaultTAccum<uint8_t> { using Type = int32_t; };

// The default output stage for a given input/output type; quantized types
// require a requantization stage, other types need nothing.
template <typename T> struct DefaultOutputStage { using Type = Nothing; };
template <> struct DefaultOutputStage<int8_t> { using Type = arm_gemm::Requantize32; };
template <> struct DefaultOutputStage<uint8_t> { using Type = arm_gemm::Requantize32; };
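
// --- Illustrative sketch (not part of the original header) ------------------
// How the traits above are typically consumed: a component templated on the
// input type can default its accumulator and output-stage types from them.
// `ExampleKernel` is a hypothetical name used only for this example.
template <typename TInput,
          typename TAccum = typename DefaultTAccum<TInput>::Type,
          typename OutputStage = typename DefaultOutputStage<TInput>::Type>
struct ExampleKernel
{
  // For TInput = uint8_t this resolves to TAccum = int32_t and
  // OutputStage = arm_gemm::Requantize32; for TInput = float it resolves to
  // TAccum = float and OutputStage = Nothing.
};
// -----------------------------------------------------------------------------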

// Interface describing the tile shape a concrete depthfirst strategy works on:
// the size of the input patch it consumes and of the output tile it produces.
class IDepthfirstStrategy
{
  public:
  virtual ~IDepthfirstStrategy() = default;

  virtual unsigned int get_input_rows() const = 0;
  virtual unsigned int get_input_cols() const = 0;

  virtual unsigned int get_output_rows() const = 0;
  virtual unsigned int get_output_cols() const = 0;
};
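
// --- Illustrative sketch (not part of the original header) ------------------
// A minimal concrete strategy: a kernel which consumes a 4x4 input patch to
// produce a 2x2 output tile, as a 3x3 stride-1 kernel would. The class name
// and tile sizes are hypothetical, chosen only for illustration.
class ExampleStrategy : public IDepthfirstStrategy
{
  public:
  unsigned int get_input_rows() const override { return 4; }
  unsigned int get_input_cols() const override { return 4; }
  unsigned int get_output_rows() const override { return 2; }
  unsigned int get_output_cols() const override { return 2; }
};
// -----------------------------------------------------------------------------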

// A tensor base pointer bundled with its row and column strides, measured in
// elements.
template <typename T>
struct TensorSpec
{
  T base;
  size_t ld_row, ld_col;

  TensorSpec(T ptr, size_t ld_row, size_t ld_col)
  : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
};
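
// --- Illustrative sketch (not part of the original header) ------------------
// Element (i, j) of a plane described by a TensorSpec is addressed with the
// two strides. `element_at` is a hypothetical helper, not part of the API.
template <typename T>
T *element_at(const TensorSpec<T *> &t, size_t i, size_t j)
{
  return t.base + i * t.ld_row + j * t.ld_col;
}
// -----------------------------------------------------------------------------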

template <typename TInput, typename TWeight, typename TOutput>
class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
{
  protected:
  using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;

  // The strategy which we're applying to solve the depthwise convolution.
  std::unique_ptr<const IDepthfirstStrategy> m_strat;

  /* Compute the amount of working space required for a single thread. */
  virtual size_t get_working_size_per_thread(unsigned int n_input_channels) const = 0;

  /* Initialise the working space for a thread. */
  virtual void initialise_working_space(void *, unsigned int n_input_channels) const = 0;

  /* Compute a portion of the output tensor with padding. */
  virtual void compute_tile_padded(
    unsigned int output_i, unsigned int output_j,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const = 0;

  /* Compute a portion of the work with only top/bottom padding.
   *
   * The default implementation of this repeatedly calls into the padded tile
   * variant.
   */
  virtual void compute_row_padded_tile_row(
    const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
    const unsigned int output_channel_start, const unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const
  {
    for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
    {
      this->compute_tile_padded(
        output_i, output_j, output_channel_start, output_channel_end,
        input, output, parameters, working_space
      );
    }
  }

  /* Compute a portion of the output tensor with no padding.
   *
   * The default implementation of this repeatedly calls into the padded
   * variant.
   */
  virtual void compute_tiles_unpadded(
    unsigned int start_output_i, unsigned int start_output_j,
    unsigned int n_tile_rows, unsigned int n_tile_cols,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const
  {
    for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
    {
      unsigned int row_start_output_j = start_output_j;
      for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
      {
        this->compute_tile_padded(
          start_output_i, row_start_output_j,
          output_channel_start, output_channel_end,
          input, output, parameters, working_space
        );
        row_start_output_j += m_strat->get_output_cols();
      }
      start_output_i += m_strat->get_output_rows();
    }
  }
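
  // --- Illustrative note (not part of the original header) ------------------
  // Both default implementations above fall back on compute_tile_padded,
  // which must bounds-check every tile. A concrete driver would typically
  // override them to dispatch a faster kernel that assumes no padding, e.g.:
  //
  //   void compute_tiles_unpadded(...) const override
  //   {
  //     // call a kernel specialised for the unpadded case, with no bounds
  //     // checks in its inner loop
  //   }
  // ---------------------------------------------------------------------------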

  void execute_internal(
    unsigned int n_batches,
    unsigned int input_height,
    unsigned int input_width,
    unsigned int n_input_channels,
    const PaddingValues &padding,
    const void *input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    const void *parameters,
    unsigned int output_height,
    unsigned int output_width,
    void *output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *working_space,
    unsigned int thread_id,
    unsigned int n_threads
  ) const override
  {
    // Get and initialise the working space for this thread.
    void *thread_working_space =
      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(n_input_channels);
    this->initialise_working_space(thread_working_space, n_input_channels);

    // Construct convenient representations of the input/output tensors.
    TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
    TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);

    const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier;

    for (unsigned int batch = 0; batch < n_batches; batch++)
    {
      // Iterate over rows of the output tensor; we stripe over the tiles.
      for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
           start_output_i < output_height;
           start_output_i += n_threads * m_strat->get_output_rows())
      {
        // Determine what padding (if any) is required on the top/bottom of
        // this row of the convolution.
        const auto end_output_i = start_output_i + m_strat->get_output_rows();
        const bool pad_output_bottom = output_height < end_output_i;

        const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top;
        const bool pad_input_top = start_input_i < 0;
        const int end_input_i = start_input_i + m_strat->get_input_rows();
        const bool pad_input_bottom = static_cast<int>(input_height) < end_input_i;
        const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;

        // Iterate over the columns of the output tensor; we attempt to grab as
        // much as possible of the unpadded regions, so the loop structure is a
        // bit odd.
        unsigned int start_output_j = 0;
        while (start_output_j < output_width)
        {
          const int start_in_j = start_output_j * this->m_args.stride_cols - padding.left;
          const bool pad_input_left = start_in_j < 0;

          // Determine if we can process a number of unpadded tiles in one go.
          int n_unpadded_tiles = 0;
          if (!pad_input_left)
          {
            // Determine the maximum number of tiles we could handle.
            n_unpadded_tiles = (output_width - start_output_j) / m_strat->get_output_cols();

            // Handle padding on the right hand edge.
            const int tile_stride = m_strat->get_output_cols() * this->m_args.stride_cols;
            int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
            int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;

            while (n_unpadded_tiles > 0 &&
                   (static_cast<int>(output_width) < end_output_j ||
                    static_cast<int>(input_width) < end_input_j))
            {
              n_unpadded_tiles--;
              end_output_j -= m_strat->get_output_cols();
              end_input_j -= tile_stride;
            }
          }

          // Process unpadded tiles, if possible, otherwise process a padded tile.
          if (n_unpadded_tiles)
          {
            if (!pad_row)
            {
              // Completely unpadded execution
              this->compute_tiles_unpadded(
                start_output_i, start_output_j,
                1, n_unpadded_tiles,   // Compute a row of unpadded tiles
                0, n_output_channels,  // Compute all channels
                input_tensor, output_tensor, parameters, thread_working_space
              );
            }
            else
            {
              // Top/bottom padding only
              this->compute_row_padded_tile_row(
                start_output_i, start_output_j, n_unpadded_tiles,
                0, n_output_channels,  // Compute all channels
                input_tensor, output_tensor, parameters, thread_working_space
              );
            }
            start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
          }
          else
          {
            this->compute_tile_padded(
              start_output_i, start_output_j,
              0, n_output_channels,  // Compute all channels
              input_tensor, output_tensor, parameters, thread_working_space
            );
            start_output_j += m_strat->get_output_cols();
          }
        }
      }

      // Progress the pointers for the next batch.
      input_tensor.base += ld_input_batch;
      output_tensor.base += ld_output_batch;
    }
  }
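
  // --- Illustrative sketch (not part of the original header) ----------------
  // The outer row loop above stripes tile rows across threads round-robin:
  // thread t starts at output row t * get_output_rows() and advances by
  // n_threads * get_output_rows(). For example, with get_output_rows() == 2
  // and n_threads == 4, thread 1 computes the tile rows starting at output
  // rows 2, 10, 18, ... The hypothetical helper below reproduces that
  // assignment.
  static bool thread_owns_tile_row(unsigned int start_output_i,
                                   unsigned int tile_rows,
                                   unsigned int thread_id,
                                   unsigned int n_threads)
  {
    // A tile row is dealt to thread (row index / tile height) mod n_threads.
    return (start_output_i / tile_rows) % n_threads == thread_id;
  }
  // ---------------------------------------------------------------------------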

  public:
  DepthfirstDriver(IDepthfirstStrategy *strategy, const DepthwiseArgs &args)
  : Parent(args), m_strat(strategy)
  {
  }

  size_t get_working_size(unsigned int n_threads, unsigned int n_input_channels) const override final
  {
    return n_threads * this->get_working_size_per_thread(n_input_channels);
  }
};

}  // namespace depthwise
}  // namespace arm_conv
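
// --- Illustrative sketch (not part of the original header) ------------------
// Typical usage pattern for a concrete driver `drv` (DepthfirstDriver itself
// is abstract): size one scratch allocation to cover every thread, then let
// each thread execute its own stripe. `drv`, `n_threads`, `n_channels`, and
// `run_in_parallel` are hypothetical, and the `execute` arguments (tensor
// shapes, pointers, and strides) are elided.
//
//   std::vector<uint8_t> scratch(drv.get_working_size(n_threads, n_channels));
//   run_in_parallel(n_threads, [&](unsigned int thread_id) {
//     drv.execute(..., scratch.data(), thread_id, n_threads);
//   });
// -----------------------------------------------------------------------------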