Arm Compute Library 22.08 — source listing of depthwise_planar.hpp.
1 /*
2  * Copyright (c) 2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "depthfirst_driver.hpp"
26 #include "interleaves/generic.hpp"
27 
28 namespace arm_conv {
29 namespace depthwise {
30 
31 template <typename OutputStage>
33 {
34  public:
35  virtual ~IPlanarStrategy() = default;
36  virtual unsigned int get_output_rows(void) const = 0;
37  virtual arm_gemm::VLType get_vl_type(void) const = 0;
38 
39  virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
40  virtual void pack_parameters(
41  const DepthwiseArgs &args, void *buffer,
42  const void *biases, const OutputStage &,
43  const void *weights, size_t ld_weight_col, size_t ld_weight_row
44  ) const = 0;
45 };
46 
47 
// Primary template mapping (input, weight, output, accumulator, output
// stage) onto the type of the planar kernel function. Left undefined; only
// the specialisations (for `Nothing` and `arm_gemm::Requantize32` output
// stages) are defined below.
template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
          typename OutputStage>
struct PlanarKernelType;
52 template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
53 struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
54 {
55  using Type = std::function<void(
56  const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
57  unsigned int pad_top, unsigned int valid_input_rows,
58  unsigned int pad_left, unsigned int valid_input_cols,
59  const TWeight *, const TAccum *,
60  TOutput **, const size_t *, const size_t *, unsigned int output_cols,
61  unsigned int start_channels, unsigned int valid_channels,
62  TAccum act_min, TAccum act_max
63  )>;
64 
65  template <typename WorkspaceType>
66  static inline void execute(
67  const Type fn,
68  const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
69  unsigned int pad_top, unsigned int valid_input_rows,
70  unsigned int pad_left, unsigned int valid_input_cols,
71  const TWeight *weights, const TAccum *bias,
72  TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols,
73  unsigned int start_channel, unsigned int valid_channels,
74  const Nothing &, const WorkspaceType *ws
75  )
76  {
77  fn(
78  inptr, ld_in_row, ld_in_col, ld_in_vl,
79  pad_top, valid_input_rows,
80  pad_left, valid_input_cols,
81  weights, bias,
82  outptrs, outlds, outvllds, output_cols,
83  start_channel, valid_channels,
84  ws->activation_min, ws->activation_max
85  );
86  }
87 };
88 
89 template <typename TInput, typename TWeight, typename TOutput>
90 struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
91 {
92  using Type = std::function<void(
93  const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
94  unsigned int pad_top, unsigned int valid_input_rows,
95  unsigned int pad_left, unsigned int valid_input_cols,
96  const TWeight *,
97  TOutput **, const size_t *, const size_t *, unsigned int output_cols,
98  unsigned int start_channel, unsigned int valid_channels,
100  )>;
101 
102  template <typename WorkspaceType>
103  static inline void execute(
104  const Type fn,
105  const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
106  unsigned int pad_top, unsigned int valid_input_rows,
107  unsigned int pad_left, unsigned int valid_input_cols,
108  const TWeight *weights, const int32_t *,
109  TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
110  unsigned int first_channel, unsigned int valid_channels,
111  const arm_gemm::Requantize32 &qp, const WorkspaceType *
112  )
113  {
114  fn(
115  inptr, ld_in_row, ld_in_col, ld_in_vl,
116  pad_top, valid_input_rows,
117  pad_left, valid_input_cols,
118  weights,
119  outptrs, outlds, outldvls, output_cols,
120  first_channel, valid_channels,
121  qp
122  );
123  }
124 };
125 
126 
127 template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
128  typename TAccum=typename DefaultTAccum<TOutput>::Type,
129  typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
130 class PlanarStrategy : public IPlanarStrategy<OutputStage>
131 {
132  unsigned int m_kernel_rows, m_kernel_cols;
133  unsigned int m_stride_rows, m_stride_cols;
134  unsigned int m_output_rows;
135  arm_gemm::VLType m_vl_type;
136 
137  protected:
138  virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
139  {
140  // Get the kernel point to pack at the given index; return false to
141  // indicate that this index (and all greater indices) is out of range.
142  if (m_kernel_rows * m_kernel_cols <= index)
143  return false;
144 
145  y = index % m_kernel_cols;
146  x = index / m_kernel_cols;
147  return true;
148  }
149 
150  virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
151  {
153  m_kernel_rows, m_kernel_cols, sizeof(TWeight),
154  false, sizeof(TAccum), // Don't pack the bias
155  m_vl_type, sizeof(TAccum), 1, // Accumulator depth of 1 TODO
156  [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
157  { return this->get_kernel_packing_point(idx, x, y); }
158  );
159  }
160 
161  public:
163  unsigned int kernel_rows, unsigned int kernel_cols,
164  unsigned int stride_rows, unsigned int stride_cols,
165  unsigned int output_rows,
166  arm_gemm::VLType vl_type
167  ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
168  m_stride_rows(stride_rows), m_stride_cols(stride_cols),
169  m_output_rows(output_rows), m_vl_type(vl_type)
170  {
171  }
172 
173  unsigned int get_output_rows(void) const override { return m_output_rows; }
174  arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }
175 
176  size_t get_storage_size(const DepthwiseArgs &args) const override
177  {
178  return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
179  }
180 
182  const DepthwiseArgs &args, void *buffer,
183  const void *biases, const OutputStage &,
184  const void *weights, size_t ld_weight_col, size_t ld_weight_row
185  ) const override
186  {
188  this->get_kernel_packing_arguments(), args,
189  buffer, biases, weights, ld_weight_col, ld_weight_row
190  );
191  }
192 
194  virtual KernelType get_kernel(void) const = 0;
195 };
196 
197 
198 namespace {
199 
200 template <typename T>
201 struct OutputRowPtrsElement
202 {
203  struct Workspace
204  {
206  size_t *output_ld_cols;
207  size_t *output_ld_vls; // Stride between vectors of channels
209  };
210 
211  template <typename OutputStage>
212  static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
213  {
214  // We need one pointer and stride for each row of output, and an additional
215  // blob of memory into which padded stores can go.
216  return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
217  get_vector_length<char>(args.strategy->get_vl_type());
218  }
219 
220  template <typename WorkspaceType, typename OutputStage>
221  static void *initialise(WorkspaceType *ws, void *buffer,
222  const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
223  {
224  const auto n_rows = args.strategy->get_output_rows();
225  ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
226  ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
227  ws->output_ld_vls = ws->output_ld_cols + n_rows;
228  ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
229  return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
230  }
231 };
232 
233 } // namespace {anonymous}
234 
235 
236 template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
237  typename TAccum=typename DefaultTAccum<TOutput>::Type,
238  typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
239 class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
240 {
241  using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
243  using WorkspaceManager = Workspace<
244  OutputRowPtrsElement<TOutput>,
245  ActivationsElement<TAccum, OutputStage>
246  >;
247  using WorkspaceType = typename WorkspaceManager::WorkspaceType;
248 
249  std::unique_ptr<StrategyType> m_strat;
250  const TAccum *m_bias;
251  OutputStage m_os;
252 
253  public:
254  DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
255  : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
256  {
257  }
258 
259  size_t get_storage_size(void) const override
260  {
261  return m_strat->get_storage_size(this->m_args);
262  }
263 
265  void *buffer, const void *biases,
266  const void *weights, size_t ld_weight_col, size_t ld_weight_row
267  ) override
268  {
269  m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
270  this->m_bias = reinterpret_cast<const TAccum *>(biases);
271  depthwise_depthfirst::stash_bias(this->m_os, biases);
272  }
273 
274  size_t get_working_size(unsigned int n_threads, unsigned int) const override
275  {
276  return this->get_working_size_per_thread() * n_threads;
277  }
278 
279  protected:
280  /* Compute the amount of working space required for a single thread. */
281  virtual size_t get_working_size_per_thread(void) const
282  {
283  return WorkspaceManager::get_sizeof_workspace(
284  WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
285  }
286 
287  /* Initialise the working space for a thread. */
288  virtual void initialise_working_space(void *buffer) const
289  {
290  WorkspaceManager::initialise(
291  buffer,
292  WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
293  );
294  }
295 
296  /* Execute the kernel for a given chunk of work. */
297  virtual void execute_kernel(
298  const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
299  unsigned int pad_top, unsigned int valid_input_rows,
300  unsigned int pad_left, unsigned int valid_input_cols,
301  const TWeight *weights, const TAccum *bias,
302  TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
303  unsigned int valid_output_rows, unsigned int valid_output_cols,
304  unsigned int first_channel, unsigned int valid_channels,
305  WorkspaceType *ws
306  ) const
307  {
308  // Initialise the output pointers
309  for (auto i = 0u; i < m_strat->get_output_rows(); i++)
310  {
311  // Point at the output tensor for all valid rows; otherwise point at the
312  // padding buffer.
313  ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
314  ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
315  ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
316  outptr += ld_out_row;
317  }
318 
319  // Execute the kernel
321  reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
322  inptr, ld_in_row, ld_in_col, ld_in_vl,
323  pad_top, valid_input_rows, pad_left, valid_input_cols,
324  weights, bias,
325  ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
326  valid_output_cols, first_channel, valid_channels,
327  this->m_os, ws
328  );
329  }
330 
331  void execute_internal(
332  unsigned int batches,
333  unsigned int input_height,
334  unsigned int input_width,
335  unsigned int n_input_channels,
336  const PaddingValues &padding,
337  const void *input,
338  size_t ld_input_col,
339  size_t ld_input_row,
340  size_t ld_input_batch,
341  const void *parameters,
342  unsigned int output_height,
343  unsigned int output_width,
344  void *output,
345  size_t ld_output_col,
346  size_t ld_output_row,
347  size_t ld_output_batch,
348  void *working_space,
349  unsigned int thread_id,
350  unsigned int n_threads
351  ) const override
352  {
353  // Get and initialise the working space for this thread.
354  void *thread_working_space =
355  static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
356  this->initialise_working_space(thread_working_space);
357  auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
358 
359  const auto n_output_channels = n_input_channels * this->m_args.channel_multiplier;
360  const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
361 
362  // Get typed pointers
363  auto input_batch = reinterpret_cast<const TInput *>(input);
364  auto output_batch = reinterpret_cast<TOutput *>(output);
365  auto weights = reinterpret_cast<const TWeight *>(parameters);
366 
367  // Iterate over batches
368  for (; batches; batches--)
369  {
370  // NOTE: Other loop orderings are possible and it would be worth
371  // investigating them.
372 
373  // Within a batch, stripe threads across rows.
374  for (auto start_output_i = thread_id * m_strat->get_output_rows();
375  start_output_i < output_height;
376  start_output_i += n_threads * m_strat->get_output_rows())
377  {
378  // Determine what (if any padding) is required on the top/bottom of
379  // this row of the convolution.
380  const int start_input_i = start_output_i * this->m_args.stride_rows - padding.top;
381  const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
382  const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
383  const unsigned int valid_input_rows = input_i > input_height ? 0 : input_height - input_i;
384  const unsigned int valid_output_rows = output_height - start_output_i;
385 
386  auto inptr_row = input_batch + input_i*ld_input_row;
387  auto outptr_row = output_batch + start_output_i * ld_output_row;
388 
389  // Execute the kernel
390  this->execute_kernel(
391  inptr_row, ld_input_row, ld_input_col, vl,
392  input_pad_top, valid_input_rows, padding.left, input_width,
393  weights, this->m_bias,
394  outptr_row, ld_output_row, ld_output_col, vl,
395  valid_output_rows, output_width,
396  0 /* first channel */, n_output_channels,
397  ws
398  );
399  }
400 
401  // Update the input and output pointers to account for batch
402  input_batch += ld_input_batch;
403  output_batch += ld_output_batch;
404  }
405  }
406 };
407 
408 } // namespace depthwise
409 } // namespace arm_conv
void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
size_t get_working_size(unsigned int n_threads, unsigned int) const override
arm_gemm::VLType get_vl_type(void) const override
static void execute(const Type fn, const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, unsigned int pad_top, unsigned int valid_input_rows, unsigned int pad_left, unsigned int valid_input_cols, const TWeight *weights, const int32_t *, TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols, unsigned int first_channel, unsigned int valid_channels, const arm_gemm::Requantize32 &qp, const WorkspaceType *)
size_t get_storage_size(void) const override
virtual size_t get_storage_size(const DepthwiseArgs &) const =0
const size_t input_height
Definition: impl.cpp:61
size_t get_storage_size(const DepthwiseArgs &args) const override
virtual arm_gemm::VLType get_vl_type(void) const =0
size_t get_storage_size_generic(const PackingArguments &packing_args, const DepthwiseArgs &args)
Definition: generic.cpp:45
typename PlanarKernelType< TInput, TWeight, TOutput, TAccum, OutputStage >::Type KernelType
const size_t input_width
Definition: impl.cpp:62
void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
size_t * output_ld_vls
static void execute(const Type fn, const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, unsigned int pad_top, unsigned int valid_input_rows, unsigned int pad_left, unsigned int valid_input_cols, const TWeight *weights, const TAccum *bias, TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols, unsigned int start_channel, unsigned int valid_channels, const Nothing &, const WorkspaceType *ws)
T * output_padding_buffer
std::unique_ptr< ParametersLibrary > parameters
Definition: Framework.cpp:46
std::function< void(const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, unsigned int pad_top, unsigned int valid_input_rows, unsigned int pad_left, unsigned int valid_input_cols, const TWeight *, const TAccum *, TOutput **, const size_t *, const size_t *, unsigned int output_cols, unsigned int start_channels, unsigned int valid_channels, TAccum act_min, TAccum act_max)> Type
size_t * output_ld_cols
DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os={})
virtual void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const =0
T ** output_row_ptrs
template UniqueDepthwiseCommon< float > depthwise(const DepthwiseArgs &, const Nothing &)
unsigned int batches
unsigned int get_output_rows(void) const override
PlanarStrategy(unsigned int kernel_rows, unsigned int kernel_cols, unsigned int stride_rows, unsigned int stride_cols, unsigned int output_rows, arm_gemm::VLType vl_type)
std::function< void(const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl, unsigned int pad_top, unsigned int valid_input_rows, unsigned int pad_left, unsigned int valid_input_cols, const TWeight *, TOutput **, const size_t *, const size_t *, unsigned int output_cols, unsigned int start_channel, unsigned int valid_channels, const arm_gemm::Requantize32 &)> Type
virtual unsigned int get_output_rows(void) const =0
void pack_parameters_generic(const PackingArguments &packing_args, const DepthwiseArgs &args, void *buffer_raw, const void *biases_raw, const void *weights_raw, size_t ld_weight_col, size_t ld_weight_row)
Definition: generic.cpp:67
const int32_t * bias