depthwise_planar.hpp
/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "depthfirst_driver.hpp"
#include "interleaves/generic.hpp"

namespace arm_conv {
namespace depthwise {

template <typename OutputStage>
class IPlanarStrategy
{
  public:
  virtual ~IPlanarStrategy() = default;
  virtual unsigned int get_output_rows(void) const = 0;
  virtual arm_gemm::VLType get_vl_type(void) const = 0;

  virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
  virtual void pack_parameters(
    const DepthwiseArgs &args, void *buffer,
    const void *biases, const OutputStage &,
    const void *weights, size_t ld_weight_col, size_t ld_weight_row
  ) const = 0;
};

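// PlanarKernelType maps the (TInput, TWeight, TOutput, TAccum, OutputStage)
// type tuple onto the function-pointer signature of the corresponding planar
// kernel, and provides a uniform execute() wrapper over it. Only the
// specialisations below are defined: one for the unquantised case (Nothing)
// and one for the requantised case (arm_gemm::Requantize32).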
template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
          typename OutputStage>
struct PlanarKernelType;

template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
{
  typedef void (*Type)(
    const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *, const TAccum *,
    TOutput **, const size_t *, const size_t *, unsigned int output_cols,
    unsigned int start_channels, unsigned int valid_channels,
    TAccum act_min, TAccum act_max
  );

  template <typename WorkspaceType>
  static inline void execute(
    const Type fn,
    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *weights, const TAccum *bias,
    TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols,
    unsigned int start_channel, unsigned int valid_channels,
    const Nothing &, const WorkspaceType *ws
  )
  {
    fn(
      inptr, ld_in_row, ld_in_col, ld_in_vl,
      pad_top, valid_input_rows,
      pad_left, valid_input_cols,
      weights, bias,
      outptrs, outlds, outvllds, output_cols,
      start_channel, valid_channels,
      ws->activation_min, ws->activation_max
    );
  }
};
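
// In the unquantised specialisation above, the activation clamp comes from the
// workspace (ws->activation_min / ws->activation_max). The requantised
// specialisation below instead forwards the arm_gemm::Requantize32 parameters
// and drops the separate bias pointer; for that path the bias travels with the
// output stage (see stash_bias in DepthwisePlanar::pack_parameters).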

template <typename TInput, typename TWeight, typename TOutput>
struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
  typedef void (*Type)(
    const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *,
    TOutput **, const size_t *, const size_t *, unsigned int output_cols,
    unsigned int start_channel, unsigned int valid_channels,
    const arm_gemm::Requantize32 &
  );

  template <typename WorkspaceType>
  static inline void execute(
    const Type fn,
    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *weights, const int32_t *,
    TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
    unsigned int first_channel, unsigned int valid_channels,
    const arm_gemm::Requantize32 &qp, const WorkspaceType *
  )
  {
    fn(
      inptr, ld_in_row, ld_in_col, ld_in_vl,
      pad_top, valid_input_rows,
      pad_left, valid_input_cols,
      weights,
      outptrs, outlds, outldvls, output_cols,
      first_channel, valid_channels,
      qp
    );
  }
};


template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
          typename TAccum=typename DefaultTAccum<TOutput>::Type,
          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
class PlanarStrategy : public IPlanarStrategy<OutputStage>
{
  unsigned int m_kernel_rows, m_kernel_cols;
  unsigned int m_stride_rows, m_stride_cols;
  unsigned int m_output_rows;
  arm_gemm::VLType m_vl_type;

  protected:
  virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
  {
    // Get the kernel point to pack at the given index; return false to
    // indicate that this index (and all greater indices) is out of range.
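    // For example, with a 3x3 kernel (m_kernel_rows == m_kernel_cols == 3) the
    // mapping is row-major: index 4 maps to (x, y) == (1, 1), index 5 maps to
    // (x, y) == (1, 2), and index 9 is the first out-of-range index.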
    if (m_kernel_rows * m_kernel_cols <= index)
      return false;

    y = index % m_kernel_cols;
    x = index / m_kernel_cols;
    return true;
  }

  virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
  {
    return interleaves::PackingArguments(
      m_kernel_rows, m_kernel_cols, sizeof(TWeight),
      false, sizeof(TAccum), true,  // Don't pack the bias
      m_vl_type, sizeof(TAccum), 1,  // Accumulator depth of 1 TODO
      [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
      { return this->get_kernel_packing_point(idx, x, y); }
    );
  }

  public:
  PlanarStrategy(
    unsigned int kernel_rows, unsigned int kernel_cols,
    unsigned int stride_rows, unsigned int stride_cols,
    unsigned int output_rows,
    arm_gemm::VLType vl_type
  ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
      m_stride_rows(stride_rows), m_stride_cols(stride_cols),
      m_output_rows(output_rows), m_vl_type(vl_type)
  {
  }

  unsigned int get_output_rows(void) const override { return m_output_rows; }
  arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }

  size_t get_storage_size(const DepthwiseArgs &args) const override
  {
    return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
  }

  void pack_parameters(
    const DepthwiseArgs &args, void *buffer,
    const void *biases, const OutputStage &,
    const void *weights, size_t ld_weight_col, size_t ld_weight_row
  ) const override
  {
    interleaves::pack_parameters_generic(
      this->get_kernel_packing_arguments(), args,
      buffer, biases, weights, ld_weight_col, ld_weight_row
    );
  }

  using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
  virtual KernelType get_kernel(void) const = 0;
};


namespace {

template <typename T>
struct OutputRowPtrsElement
{
  struct Workspace
  {
    T **output_row_ptrs;
    size_t *output_ld_cols;
    size_t *output_ld_vls;  // Stride between vectors of channels
    T *output_padding_buffer;
  };

  template <typename OutputStage>
  static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
  {
    // We need one pointer and two strides for each row of output, plus an
    // additional blob of memory into which padded stores can go.
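    // As an illustration (sizes are target-dependent): with four output rows
    // on a 64-bit target using 128-bit vectors this comes to
    //   4 * (8 + 2*8) + 16 = 112 bytes,
    // i.e. four row pointers, four column strides, four vector strides and one
    // vector's worth of padding scratch.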
    return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
           get_vector_length<char>(args.strategy->get_vl_type());
  }

  template <typename WorkspaceType, typename OutputStage>
  static void *initialise(WorkspaceType *ws, void *buffer,
                          const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
  {
    const auto n_rows = args.strategy->get_output_rows();
    ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
    ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
    ws->output_ld_vls = ws->output_ld_cols + n_rows;
    ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
    return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
  }
};

} // namespace {anonymous}


template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
          typename TAccum=typename DefaultTAccum<TOutput>::Type,
          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
{
  using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
  using StrategyType = IPlanarStrategy<OutputStage>;
  using WorkspaceManager = Workspace<
    OutputRowPtrsElement<TOutput>,
    ActivationsElement<TAccum, OutputStage>
  >;
  using WorkspaceType = typename WorkspaceManager::WorkspaceType;

  std::unique_ptr<StrategyType> m_strat;
  const TAccum *m_bias;
  OutputStage m_os;

  public:
  DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
  : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
  {
  }

  DepthwisePlanar(DepthwisePlanar &) = delete;
  DepthwisePlanar &operator=(DepthwisePlanar &) = delete;

  size_t get_storage_size(void) const override
  {
    return m_strat->get_storage_size(this->m_args);
  }

  void pack_parameters(
    void *buffer, const void *biases,
    const void *weights, size_t ld_weight_col, size_t ld_weight_row
  ) override
  {
    m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
    this->m_bias = reinterpret_cast<const TAccum *>(biases);
    depthwise_depthfirst::stash_bias(this->m_os, biases);
  }

  size_t get_working_size(unsigned int n_threads) const override
  {
    return this->get_working_size_per_thread() * n_threads;
  }

  protected:
  /* Compute the amount of working space required for a single thread. */
  virtual size_t get_working_size_per_thread(void) const
  {
    return WorkspaceManager::get_sizeof_workspace(
      WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
  }

  /* Initialise the working space for a thread. */
  virtual void initialise_working_space(void *buffer) const
  {
    WorkspaceManager::initialise(
      buffer,
      WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
    );
  }

  /* Execute the kernel for a given chunk of work. */
  virtual void execute_kernel(
    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *weights, const TAccum *bias,
    TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
    unsigned int valid_output_rows, unsigned int valid_output_cols,
    unsigned int first_channel, unsigned int valid_channels,
    WorkspaceType *ws
  ) const
  {
    // Initialise the output pointers
    for (auto i = 0u; i < m_strat->get_output_rows(); i++)
    {
      // Point at the output tensor for all valid rows; otherwise point at the
      // padding buffer.
      ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
      ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
      ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
      outptr += ld_out_row;
    }

    // Execute the kernel
    PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
      reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
      inptr, ld_in_row, ld_in_col, ld_in_vl,
      pad_top, valid_input_rows, pad_left, valid_input_cols,
      weights, bias,
      ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
      valid_output_cols, first_channel, valid_channels,
      this->m_os, ws
    );
  }

  void execute_internal(
    const DepthwiseArgs &args,
    const void *input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    const void *parameters,
    void *output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *working_space,
    unsigned int thread_id,
    unsigned int n_threads
  ) const override
  {
    // Get and initialise the working space for this thread.
    void *thread_working_space =
      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
    this->initialise_working_space(thread_working_space);
    auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);

    const auto n_output_channels = args.input_channels * args.channel_multiplier;
    const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());

    // Get typed pointers
    auto input_batch = reinterpret_cast<const TInput *>(input);
    auto output_batch = reinterpret_cast<TOutput *>(output);
    auto weights = reinterpret_cast<const TWeight *>(parameters);

    // Iterate over batches
    for (auto batches = args.n_batches; batches; batches--)
    {
      // NOTE: Other loop orderings are possible and it would be worth
      // investigating them.

      // Within a batch, stripe threads across rows.
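      // For example, if the strategy computes four rows of output per call and
      // two threads are used, then thread 0 handles output rows 0-3, 8-11,
      // 16-19, ... and thread 1 handles rows 4-7, 12-15, 20-23, ...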
      for (auto start_output_i = thread_id * m_strat->get_output_rows();
           start_output_i < args.output_rows;
           start_output_i += n_threads * m_strat->get_output_rows())
      {
        // Determine what padding (if any) is required on the top/bottom of
        // this row of the convolution.
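        // For example, with stride_rows == 1 and padding.top == 1, the first
        // stripe (start_output_i == 0) gives start_input_i == -1, hence
        // input_pad_top == 1 and reading begins at input row 0.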
        const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
        const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
        const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
        const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i;
        const unsigned int valid_output_rows = args.output_rows - start_output_i;

        auto inptr_row = input_batch + input_i*ld_input_row;
        auto outptr_row = output_batch + start_output_i * ld_output_row;

        // Execute the kernel
        this->execute_kernel(
          inptr_row, ld_input_row, ld_input_col, vl,
          input_pad_top, valid_input_rows, args.padding.left, args.input_cols,
          weights, this->m_bias,
          outptr_row, ld_output_row, ld_output_col, vl,
          valid_output_rows, args.output_cols,
          0 /* first channel */, n_output_channels,
          ws
        );
      }

      // Update the input and output pointers to account for batch
      input_batch += ld_input_batch;
      output_batch += ld_output_batch;
    }
  }
};
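
// A minimal usage sketch (not part of this header; the names MyPlanarStrategy
// and my_planar_kernel below are hypothetical). A concrete strategy derives
// from PlanarStrategy, describes its geometry to the base-class constructor
// and returns a pointer to its kernel routine; DepthwisePlanar then takes
// ownership of the strategy, packs the parameters into a caller-provided
// buffer, and drives execution through the interface it inherits from
// DepthwiseCommon.
//
//   class MyPlanarStrategy : public PlanarStrategy<float>
//   {
//     public:
//     MyPlanarStrategy()
//     : PlanarStrategy<float>(3, 3,  // 3x3 kernel
//                             1, 1,  // unit stride
//                             4,     // four output rows per call
//                             arm_gemm::VLType::None)
//     {
//     }
//
//     // my_planar_kernel must match PlanarKernelType<float, ...>::Type.
//     KernelType get_kernel(void) const override { return my_planar_kernel; }
//   };
//
//   DepthwisePlanar<float> planar(new MyPlanarStrategy(), args);
//   planar.pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row);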

} // namespace depthwise
} // namespace arm_conv