Compute Library
 23.05
depthwise_depthfirst_multiplier.hpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2023 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #pragma once
26 
27 #include "depthwise_depthfirst.hpp"
29 
30 #ifdef CYCLE_PROFILING
31 #include "profiler.hpp"
32 #endif
33 
34 #include <limits>
35 
36 namespace arm_conv {
37 namespace depthwise {
38 
39 template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
40 class DepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>
41 {
43 
44  protected:
45  virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
46  {
48  args.kernel_rows, args.kernel_cols, sizeof(TWeight),
49  true, sizeof(TAccum),
50  this->get_vl_type(),
51  sizeof(TAccum), 1,
52  [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
53  {
54  if (pos < args.kernel_rows * args.kernel_cols)
55  {
56  y = pos % args.kernel_cols;
57  x = pos / args.kernel_cols;
58  return true;
59  }
60  return false;
61  }
62  );
63  }
64 
65  public:
66  using Parent::Parent;
67 
68  size_t get_storage_size(const DepthwiseArgs &args) const override
69  {
70  return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
71  }
72 
73  void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
74  {
76  this->get_packing_args(args), args,
77  buffer, biases, weights, ld_weight_col, ld_weight_row
78  );
79  }
80 
81  using KernelType = std::function<void(
82  const TInput *const *, // Input pointers
83  TOutput *const *, // Output pointers
84  const void *, // Ravelled bias, weights, and quantization parameters
85  unsigned int, // # output channels
86  TAccum, TAccum // Min and max activation clamps
87  )>;
88  virtual KernelType get_kernel(void) const = 0;
89 };
90 
91 
92 template <typename TInput, typename TWeight, typename TOutput>
93 class DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t> : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
94 {
96 
97  public:
98  using Parent::Parent;
99 
100  size_t get_storage_size(const DepthwiseArgs &args) const override
101  {
103  }
104 
105  void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
106  {
107  interleaves::quantized::pack_parameters<TWeight>(
108  buffer, reinterpret_cast<const int32_t *>(biases),
109  reinterpret_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row,
110  args, qp, this->get_vl_type(), this->get_accumulator_depth_vl()
111  );
112  }
113 
114  using KernelType = std::function<void(
115  const TInput *const *, // Input pointers
116  TOutput *const *, // Output pointers
117  const void *, // Ravelled bias, weights, and quantization parameters
118  unsigned int, // # output channels
119  const arm_gemm::Requantize32 &
120  )>;
121  virtual KernelType get_kernel(void) const = 0;
122 };
123 
124 
125 template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
127 {
128  const arm_gemm::VLType m_vl_type;
129  const unsigned int m_output_rows, m_output_cols;
130 
131  public:
132  GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
133  : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
134  {
135  }
136 
137  virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
138 
139  arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
140  unsigned int get_output_rows(void) const { return m_output_rows; }
141  unsigned int get_output_cols(void) const { return m_output_cols; }
142 
143  using KernelType = std::function<void(
144  const TInput *const *, // Input pointers
145  TOutput *const *, // Output pointers
146  const TWeight *, // Ravelled weight parameters
147  const TAccum *, // Bias,
148  unsigned int, unsigned int, // Number of kernel points, number of output channels
149  TAccum, TAccum // Activation minimum and maximum
150  )>;
151  virtual KernelType get_kernel(void) const = 0;
152 };
153 
154 template <typename TInput, typename TWeight, typename TOutput>
155 class GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, int32_t>
156 {
157  const arm_gemm::VLType m_vl_type;
158  const unsigned int m_output_rows, m_output_cols;
159 
160  public:
161  GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
162  : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
163  {
164  }
165 
166  virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;
167 
168  arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
169  unsigned int get_output_rows(void) const { return m_output_rows; }
170  unsigned int get_output_cols(void) const { return m_output_cols; }
171 
172  using KernelType = std::function<void(
173  const TInput *const *, // Input pointers
174  TOutput *const *, // Output pointers
175  const TWeight *, // Ravelled weight parameters
176  const int32_t *, // Bias,
177  unsigned int, unsigned int, // Number of kernel points, number of output channels
178  const int32_t *, const int32_t *, const int32_t *, // Per-channel left-shifts, multipliers, right-shifts (need to account for start channel)
179  const arm_gemm::Requantize32 &
180  )>;
181  virtual KernelType get_kernel(void) const = 0;
182 };
183 
184 template <typename TInput,
185  typename TWeight=TInput,
186  typename TOutput=TInput,
187  typename TAccum=typename DefaultTAccum<TInput>::Type,
188  typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
189 class GenericDepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
190 {
192  std::unique_ptr<KernelStrategyType> m_kern;
193 
194  protected:
195  virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
196  {
198  args.kernel_rows, args.kernel_cols, sizeof(TWeight),
199  false, sizeof(TAccum),
200  this->get_vl_type(),
201  sizeof(TAccum), 1,
202  [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
203  {
204  if (pos < args.kernel_rows * args.kernel_cols)
205  {
206  y = pos % args.kernel_cols;
207  x = pos / args.kernel_cols;
208  return true;
209  }
210  return false;
211  }
212  );
213  }
214 
215  public:
216  GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args)
217  : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
218  kern->get_output_rows(), kern->get_output_cols(),
219  args.kernel_rows, args.kernel_cols,
220  args.stride_rows, args.stride_cols
221  ),
222  m_kern(kern)
223  {
224  };
225 
226  arm_gemm::VLType get_vl_type(void) const override { return m_kern->get_vl_type(); }
227  const typename KernelStrategyType::KernelType get_kernel(void) const { return m_kern->get_kernel(); }
228 
229  size_t get_storage_size(const DepthwiseArgs &args) const override
230  {
231  return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
232  }
233 
234  void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
235  {
237  this->get_packing_args(args), args,
238  buffer, biases, weights, ld_weight_col, ld_weight_row
239  );
240  }
241 };
242 
243 // Specialise elements of the wrapper based on the type of kernel.
244 namespace depthfirst_multiplier {
245 
246 /* Working space element which contains a pointer for each row of input, a row
247  * of padding, and a space which can be used to construct an NCHW-ordered patch
248  * of input.
249  */
250 template <typename T, bool IsGeneric=false, typename OutputStage=Nothing>
252 {
253  public:
254  struct Workspace
255  {
256  constexpr static bool InputPatchIsGeneric = IsGeneric;
257  const T **input_rows;
260  };
261 
262  static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
263  {
264  return sizeof_input_rows(args) + sizeof_input_padding(args) + sizeof_input_patch(args);
265  }
266 
267  template <class WorkspaceType>
268  static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
269  {
270  auto buffer_bytes = reinterpret_cast<char *>(buffer);
271 
272  ws->input_rows = reinterpret_cast<const T **>(buffer_bytes);
273  buffer_bytes += sizeof_input_rows(args);
274 
275  ws->input_padding = reinterpret_cast<T*>(buffer_bytes);
276  buffer_bytes += sizeof_input_padding(args);
277 
278  ws->input_patch = reinterpret_cast<T*>(buffer_bytes);
279  buffer_bytes += sizeof_input_patch(args);
280 
281  // Initialise the padding
282  memset(ws->input_padding,
283  get_input_buffer_fill_value(args.output_stage),
284  sizeof_input_padding(args));
285 
286  return buffer_bytes;
287  }
288 
289  protected:
290  static size_t sizeof_input_rows(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
291  {
292  if (IsGeneric)
293  {
294  return sizeof(T *) * args.strategy->get_output_rows() * args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
295  }
296  else
297  {
298  return sizeof(T *) * args.strategy->get_input_rows();
299  }
300  }
301 
302  static size_t sizeof_input_padding(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
303  {
304  // Round-up the number of columns to be a whole number of QUADS
305  auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
306  return sizeof(T) * input_cols;
307  }
308 
309  static size_t sizeof_input_patch(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
310  {
311  if (IsGeneric)
312  {
313  // Round-up the number of columns to be a whole number of QUADS
314  auto output_cols = arm_gemm::roundup<size_t>(args.strategy->get_output_cols(), 16 / sizeof(T));
315  const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
316  return sizeof(T) * kernel_points * args.strategy->get_output_rows() * output_cols;
317  }
318  else
319  {
320  // Round-up the number of columns to be a whole number of QUADS
321  auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
322  return sizeof(T) * args.strategy->get_input_rows() * input_cols;
323  }
324  }
325 };
326 
327 template <bool IsGeneric, typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
329 {
331 
332  template <typename WorkspaceType>
333  static void execute(
334  const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
335  const OutputStage &, const unsigned int,
336  const void *parameters, const void *
337  )
338  {
339  strat->get_kernel()(
340  ws->input_rows,
341  ws->outptr_array,
342  parameters, args.channel_multiplier,
343  ws->activation_min, ws->activation_max
344  );
345  }
346 };
347 
348 template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
349 struct StrategyType<true, TInput, TWeight, TOutput, TAccum, OutputStage>
350 {
352 
353  template <typename WorkspaceType>
354  static void execute(
355  const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
356  const OutputStage &, const unsigned int start_output_channel,
357  const void *parameters, const void *bias
358  )
359  {
360  strat->get_kernel()(
361  ws->input_rows, ws->outptr_array,
362  reinterpret_cast<const TWeight *>(parameters),
363  bias == nullptr ? nullptr : reinterpret_cast<const TAccum *>(bias) + start_output_channel,
364  strat->get_kernel_rows() * strat->get_kernel_cols(),
365  args.channel_multiplier,
366  ws->activation_min, ws->activation_max
367  );
368  }
369 };
370 
371 template <typename TInput, typename TWeight, typename TOutput>
372 struct StrategyType<false, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
373 {
375 
376  template <typename WorkspaceType>
377  static void execute(
378  const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
379  const arm_gemm::Requantize32 &qp, const unsigned int,
380  const void *parameters, const void *
381  )
382  {
383  strat->get_kernel()(
384  ws->input_rows,
385  ws->outptr_array,
386  parameters, args.channel_multiplier,
387  qp
388  );
389  }
390 };
391 
392 template <typename TInput, typename TWeight, typename TOutput>
393 struct StrategyType<true, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
394 {
396 
397  template <typename WorkspaceType>
398  static void execute(
399  const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
400  const arm_gemm::Requantize32 &qp, const unsigned int start_output_channel,
401  const void *parameters, const void *
402  )
403  {
404  auto get_ptr = [start_output_channel] (const int32_t *ptr) -> const int32_t *
405  {
406  return ptr == nullptr ? nullptr : ptr + start_output_channel;
407  };
408 
409  strat->get_kernel()(
410  ws->input_rows, ws->outptr_array,
411  reinterpret_cast<const TWeight *>(parameters),
412  get_ptr(qp.bias),
413  strat->get_kernel_rows() * strat->get_kernel_cols(),
414  args.channel_multiplier,
415  get_ptr(qp.per_channel_left_shifts),
416  get_ptr(qp.per_channel_muls),
417  get_ptr(qp.per_channel_right_shifts),
418  qp
419  );
420  }
421 };
422 
423 template <bool IsGeneric> struct PrepareInputSample;
424 
425 template <> struct PrepareInputSample<false>
426 {
427  template <typename WorkspaceType, typename StrategyType, typename T>
428  static void execute(
429  const DepthwiseArgs &, WorkspaceType *ws, const StrategyType *strat,
430  T *base_ptr, size_t ld_row, size_t ld_col,
431  const unsigned int input_pad_top, const unsigned int valid_rows,
432  const unsigned int input_pad_left, const unsigned int valid_cols
433  )
434  {
436  ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(),
437  base_ptr, ld_row, ld_col,
438  ws->input_padding,
439  input_pad_top, valid_rows,
440  input_pad_left, valid_cols
441  );
442  }
443 };
444 
445 template <> struct PrepareInputSample<true>
446 {
447  template <typename WorkspaceType, typename StrategyType, typename T>
448  static void execute(
449  const DepthwiseArgs &args, WorkspaceType *ws, const StrategyType *strat,
450  T *base_ptr, size_t ld_row, size_t ld_col,
451  const unsigned int input_pad_top, const unsigned int valid_rows,
452  const unsigned int input_pad_left, const unsigned int valid_cols
453  )
454  {
456  ws->input_rows, ws->input_patch,
457  strat->get_output_rows(), strat->get_output_cols(),
458  args.kernel_rows, args.kernel_cols,
459  args.stride_rows, args.stride_cols,
460  base_ptr, ld_row, ld_col,
461  ws->input_padding,
462  input_pad_top, valid_rows,
463  input_pad_left, valid_cols
464  );
465  }
466 };
467 
468 } // namespace depthfirst_multiplier
469 
470 template <typename TInput,
471  typename TWeight=TInput,
472  typename TOutput=TInput,
473  typename TAccum=typename DefaultTAccum<TInput>::Type,
474  bool is_generic=false,
475  typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
476 class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, TOutput>
477 {
478  protected:
480  using WorkspaceManager = Workspace<
481  OutputArrayElement<TOutput>,
483  ActivationsElement<TOutput, OutputStage>
484  >;
485  using WorkingSpace = typename WorkspaceManager::WorkspaceType;
486 
487  OutputStage m_os; // Copy of the output parameters
488  const void *m_bias = nullptr; // Copy of the bias (should we need it)
489 
490  public:
491  DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
492  : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
493  {
494  }
495 
498 
499  size_t get_storage_size(void) const override
500  {
501  return reinterpret_cast<const StratType *>(this->m_strat.get())
502  ->get_storage_size(this->m_args);
503  }
504 
505  void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
506  {
507  reinterpret_cast<const StratType *>(this->m_strat.get())
508  ->pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
509  m_bias = biases;
510  depthwise_depthfirst::stash_bias(m_os, biases);
511  }
512 
513  size_t get_working_size_per_thread(const unsigned int n_input_channels) const override
514  {
515  DepthwiseArgs args(this->m_args);
516  args.input_channels = n_input_channels;
517  return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
518  }
519 
520  void initialise_working_space(void *buffer, unsigned int n_input_channels) const override
521  {
522  DepthwiseArgs args(this->m_args);
523  args.input_channels = n_input_channels;
524  return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
525  }
526 
528  const DepthwiseArgs &args,
529  unsigned int output_i, unsigned int output_j,
530  unsigned int output_channel_start, unsigned int output_channel_end,
532  const TensorSpec<TOutput *> &output,
533  const void *parameters,
534  void *working_space_raw
535  ) const override
536  {
537  // Get the working space
538  auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);
539 
540  const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
541  const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
542  const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);
543 
544  const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
545  const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
546  const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
547 
548  // Compute the output pointer array. We'll update this array after every
549  // invocation of the kernel.
551  ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
552  output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
553  output.ld_row, output.ld_col,
554  ws->output_buffer,
555  0, args.output_rows - output_i, // Top padding, # valid rows
556  0, args.output_cols - output_j // Left padding, # valid columns
557  );
558 
559  // Compute the parameter stride
560  DepthwiseArgs single_iter(args);
561  single_iter.input_channels = 1;
562  const size_t parameter_stride = reinterpret_cast<const StratType *>(this->m_strat.get())
563  ->get_storage_size(single_iter);
564 
565  for (; output_channel_start < output_channel_end;
566  output_channel_start += args.channel_multiplier)
567  {
568  // Compute the input pointer array
569  const auto input_channel = output_channel_start / args.channel_multiplier;
570 
571  // Construct the input patch
573  args, ws, this->m_strat.get(),
574  input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col,
575  input_pad_top, args.input_rows - input_i,
576  input_pad_left, args.input_cols - input_j
577  );
578 
579  // Execute the kernel
581  args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
582  parameters, m_bias
583  );
584 
585  // Update the output pointers
586  for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++)
587  {
588  ws->outptr_array[n] += args.channel_multiplier;
589  }
590 
591  // Progress the parameters
592  parameters = reinterpret_cast<const char *>(parameters) + parameter_stride;
593  }
594  }
595 };
596 
597 } // namespace depthwise
598 } // namespace arm_conv
static void execute(const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, const arm_gemm::Requantize32 &qp, const unsigned int start_output_channel, const void *parameters, const void *)
const int32_t * bias
Definition: arm_gemm.hpp:172
std::function< void(const TInput *const *, TOutput *const *, const void *, unsigned int, const arm_gemm::Requantize32 &)> KernelType
const int32_t * per_channel_left_shifts
Definition: arm_gemm.hpp:181
void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
void fill_nchw_patch_array(size_t element_size, const void **dest_row_pointers_raw, void *dest_patch_raw, const unsigned int patch_rows, unsigned int patch_cols, const void *src_ptr_raw, size_t ld_row, size_t ld_col, const void *pad_row, const unsigned int pad_top, const unsigned int valid_rows, const unsigned int pad_left, const unsigned int valid_cols)
Definition: addressing.cpp:170
size_t get_storage_size_generic(const PackingArguments &packing_args, const DepthwiseArgs &args)
Definition: generic.cpp:45
void fill_patch_array_generic_kernel(size_t element_size, const void **dest_pointers_raw, void *patch_raw, const unsigned int output_rows, const unsigned int output_cols, const unsigned int kernel_rows, const unsigned int kernel_cols, const unsigned int stride_rows, const unsigned int stride_cols, const void *src_ptr_raw, size_t ld_row, size_t ld_col, const void *pad_row, const unsigned int pad_top, const unsigned int valid_rows, const unsigned int pad_left, const unsigned int valid_cols)
Definition: addressing.cpp:258
static void execute(const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, const OutputStage &, const unsigned int, const void *parameters, const void *)
static void execute(const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, const arm_gemm::Requantize32 &qp, const unsigned int, const void *parameters, const void *)
GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
void initialise_working_space(void *buffer, unsigned int n_input_channels) const override
std::function< void(const TInput *const *, TOutput *const *, const TWeight *, const TAccum *, unsigned int, unsigned int, TAccum, TAccum)> KernelType
static void execute(const DepthwiseArgs &args, WorkspaceType *ws, const StrategyType *strat, T *base_ptr, size_t ld_row, size_t ld_col, const unsigned int input_pad_top, const unsigned int valid_rows, const unsigned int input_pad_left, const unsigned int valid_cols)
GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
std::unique_ptr< ParametersLibrary > parameters
Definition: Framework.cpp:46
virtual arm_gemm::VLType get_vl_type() const =0
size_t get_storage_size(const DepthwiseArgs &args, const arm_gemm::VLType vl_type, const unsigned int accumulator_depth_vl)
size_t get_working_size_per_thread(const unsigned int n_input_channels) const override
static void * initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs< IDepthfirstStrategy, OutputStage > &args)
static void execute(const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, const OutputStage &, const unsigned int start_output_channel, const void *parameters, const void *bias)
const int32_t * per_channel_right_shifts
Definition: arm_gemm.hpp:182
void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
void fill_pointer_array(size_t element_size, void **dest_raw, const unsigned int array_rows, const unsigned int array_cols, void *base_ptr_raw, size_t ld_row, size_t ld_col, void *pad_buffer_raw, const unsigned int pad_top, const unsigned int valid_rows, const unsigned int pad_left, const unsigned int valid_cols)
Definition: addressing.cpp:32
std::function< void(const TInput *const *, TOutput *const *, const TWeight *, const int32_t *, unsigned int, unsigned int, const int32_t *, const int32_t *, const int32_t *, const arm_gemm::Requantize32 &)> KernelType
static void execute(const DepthwiseArgs &, WorkspaceType *ws, const StrategyType *strat, T *base_ptr, size_t ld_row, size_t ld_col, const unsigned int input_pad_top, const unsigned int valid_rows, const unsigned int input_pad_left, const unsigned int valid_cols)
template UniqueDepthwiseCommon< float > depthwise(const DepthwiseArgs &, const Nothing &)
void compute_tile_padded(const DepthwiseArgs &args, unsigned int output_i, unsigned int output_j, unsigned int output_channel_start, unsigned int output_channel_end, const TensorSpec< const TInput *> &input, const TensorSpec< TOutput *> &output, const void *parameters, void *working_space_raw) const override
virtual KernelType get_kernel(void) const =0
GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args)
DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os={})
static size_t get_element_size(const WorkspaceArgs< IDepthfirstStrategy, OutputStage > &args)
std::function< void(const TInput *const *, TOutput *const *, const void *, unsigned int, TAccum, TAccum)> KernelType
const int32_t * per_channel_muls
Definition: arm_gemm.hpp:183
size_t get_storage_size(const DepthwiseArgs &args) const override
void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
void pack_parameters_generic(const PackingArguments &packing_args, const DepthwiseArgs &args, void *buffer_raw, const void *biases_raw, const void *weights_raw, size_t ld_weight_col, size_t ld_weight_row)
Definition: generic.cpp:67
size_t get_storage_size(const DepthwiseArgs &args) const override
const int32_t * bias