31 #ifdef CYCLE_PROFILING
32 #include "profiler.hpp"
40 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum,
46 unsigned int m_output_rows, m_output_cols;
47 unsigned int m_kernel_rows, m_kernel_cols;
48 unsigned int m_stride_rows, m_stride_cols;
52 unsigned int output_rows,
unsigned int output_cols,
53 unsigned int kernel_rows,
unsigned int kernel_cols,
54 unsigned int stride_rows=1,
unsigned int stride_cols=1
55 ) : m_output_rows(output_rows), m_output_cols(output_cols),
56 m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
57 m_stride_rows(stride_rows), m_stride_cols(stride_cols)
78 template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
87 const TInput *
const *input_ptrs,
88 TOutput *
const *output_ptrs,
90 unsigned int n_channels,
97 const unsigned int n_tile_rows,
const unsigned int n_tile_cols,
98 const TInput *inptr_base, int64_t ld_input_row, int64_t ld_input_col,
99 TOutput *outptr_base, int64_t ld_output_row, int64_t ld_output_col,
100 const void *params,
unsigned int n_channels,
107 template <
typename TInput,
typename TWeight,
typename TOutput>
120 [
this] (
unsigned int idx,
unsigned int &x,
unsigned int &y) ->
bool
126 using Parent::Parent;
130 const TInput *
const *,
134 const int32_t *,
const int32_t *,
145 const DepthwiseArgs &
args,
void *buffer,
147 const void *weights,
size_t ld_weight_col,
size_t ld_weight_row
151 get_packing_args(),
args, buffer, biases, weights, ld_weight_col, ld_weight_row);
155 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum,
typename OutputStage>
// Non-const accessor for the output stage: returns a mutable reference to
// the m_os member, so callers can modify the stored stage through it.
162 inline OutputStage &get_output_stage(
void) {
return m_os; }
// Const overload of the output-stage accessor: returns a read-only reference
// to the m_os member and is callable on const instances.
163 inline const OutputStage &get_output_stage(
void)
const {
return m_os; }
165 bool uses_intermediate_array()
const
167 return this->m_args.channel_multiplier != 1 && this->uses_premultiply();
170 virtual void fill_inptr_array(
const DepthwiseArgs &
args,
173 const unsigned int input_i,
const unsigned int input_j,
174 const unsigned int input_pad_top,
const unsigned int input_pad_left)
const = 0;
176 void initialise_inptr_array(
const DepthwiseArgs &
args,
177 unsigned int output_channel_start,
unsigned int output_channel_end,
180 const unsigned int input_i,
const unsigned int input_j,
181 const unsigned int input_pad_top,
const unsigned int input_pad_left,
182 Tile<TInput> &multiplied_input
186 const auto input_channel_start = output_channel_start /
args.channel_multiplier;
188 const auto last_valid_row = std::min(input_pad_top +
args.input_rows - input_i, this->m_strat->get_input_rows());
189 const auto last_valid_col = std::min(input_pad_left +
args.input_cols - input_j, this->m_strat->get_input_cols());
191 const auto tile_rows = last_valid_row - input_pad_top;
192 const auto tile_cols = last_valid_col - input_pad_left;
194 const auto tile_channels = output_channel_end - output_channel_start;
197 if (this->uses_intermediate_array()) {
201 input_i, input_j,
args.channel_multiplier);
204 multiplied_input.array,
205 tile_cols * tile_channels, tile_channels
209 input.base + input_i*
input.ld_row + input_j*
input.ld_col + input_channel_start,
214 fill_inptr_array(
args,
234 return reinterpret_cast<const StratType *
>(this->m_strat.get())->
238 void pack_parameters(
void *buffer,
const void *biases,
const void *weights,
size_t ld_weight_col,
size_t ld_weight_row)
override
240 reinterpret_cast<const StratType *
>(this->m_strat.get())->
241 pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
245 namespace depthwise_depthfirst {
250 template <
typename T>
259 template <
class OutputStage>
262 return sizeof(T **) *
args.strategy->get_input_rows() *
args.strategy->get_input_cols();
265 template <
class WorkspaceType,
class OutputStage>
266 static void *
initialise(WorkspaceType *ws,
void *buffer,
const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &
args)
268 ws->inptr_array =
reinterpret_cast<const T**
>(buffer);
273 template <
typename TAccum,
typename OutputStage,
bool IsDot=false>
276 using Element = ActivationsElement<TAccum, OutputStage>;
282 using Element = RequantizationParametersElement;
285 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum,
typename OutputStage>
290 template <
typename Strat,
typename Workspace>
291 static inline void indirect(
const Strat *strat,
const Workspace *ws,
const OutputStage &,
const void *params,
const TAccum *,
unsigned int n_channels)
293 strat->get_indirect_kernel()(
297 ws->activation_min, ws->activation_max
301 template <
typename Strat,
typename Workspace>
303 const Strat *strat,
const Workspace *ws,
const OutputStage &,
304 unsigned int n_tile_rows,
unsigned int n_tile_cols,
305 const TInput *inptr,
size_t ld_in_row,
size_t ld_in_col,
306 TOutput *outptr,
size_t ld_out_row,
size_t ld_out_col,
307 const void *params,
unsigned int n_channels
310 strat->get_direct_kernel()(
311 n_tile_rows, n_tile_cols,
312 inptr, ld_in_row, ld_in_col,
313 outptr, ld_out_row, ld_out_col,
314 params, n_channels, ws->activation_min, ws->activation_max
319 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum>
324 template <
typename Strat,
typename Workspace>
325 static inline void indirect(
const Strat *strat,
const Workspace *ws,
const arm_gemm::Requantize32 &qp,
const void *params,
const TAccum *,
unsigned int n_channels)
328 n_channels, ws->inptr_array,
329 reinterpret_cast<const TWeight *
>(params), ws->bias,
330 qp, ws->requant_muls, ws->requant_shifts,
335 template <
typename Strat,
typename Workspace>
338 unsigned int,
unsigned int,
339 const TInput *,
size_t,
size_t,
340 TOutput *,
size_t,
size_t,
341 const void *,
unsigned int
// Generic stash_bias: both parameters are unnamed and the body is empty, so
// for an arbitrary OutputStage type this call does nothing with either the
// output stage or the bias pointer.
// NOTE(review): the `qp.bias = reinterpret_cast<const int32_t *>(bias)`
// assignment that follows in this file presumably belongs to a more specific
// overload whose signature is not visible here -- confirm.
352 template <
typename OutputStage>
353 inline void stash_bias(OutputStage &,
const void *) {}
361 qp.
bias =
reinterpret_cast<const int32_t *
>(
bias);
368 template <
typename TInput,
369 typename TWeight=TInput,
370 typename TOutput=TInput,
378 using WorkspaceManager = Workspace<
379 OutputArrayElement<TOutput>,
381 InputBufferElement<TInput>,
382 IntermediateBufferElement<TInput>,
385 using WorkingSpace =
typename WorkspaceManager::WorkspaceType;
388 const TAccum *m_bias;
392 : Parent(strat,
args, os), m_bias(
nullptr)
399 void pack_parameters(
void *buffer,
const void *biases,
const void *weights,
size_t ld_weight_col,
size_t ld_weight_row)
override
402 this->m_args, buffer, biases, this->get_output_stage(),
403 weights, ld_weight_col, ld_weight_row
405 m_bias =
reinterpret_cast<const TAccum *
>(biases);
406 depthwise_depthfirst::stash_bias(this->get_output_stage(), biases);
411 DepthwiseArgs
args(this->m_args);
412 return WorkspaceManager::get_sizeof_workspace(
413 WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(),
args, this->get_output_stage())
419 DepthwiseArgs
args(this->m_args);
420 WorkspaceManager::initialise(
421 buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(),
args, this->get_output_stage())
428 return Invoker::supports_direct_kernel && this->uses_intermediate_array();
433 void fill_inptr_array(
const DepthwiseArgs &
args,
436 const unsigned int input_i,
const unsigned int input_j,
437 const unsigned int input_pad_top,
const unsigned int input_pad_left)
const override
439 fill_pointer_array<const TInput>(
440 inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
444 input_pad_top,
args.input_rows - input_i,
445 input_pad_left,
args.input_cols - input_j
449 void compute_tile_padded(
450 const DepthwiseArgs &
args,
451 unsigned int output_i,
unsigned int output_j,
452 unsigned int output_channel_start,
unsigned int output_channel_end,
453 const TensorSpec<const TInput *> &
input,
454 const TensorSpec<TOutput *> &output,
456 void *working_space_raw
460 auto ws =
reinterpret_cast<WorkingSpace *
>(working_space_raw);
463 const int ii =
static_cast<int>(output_i *
args.stride_rows) -
args.padding.top;
464 const auto input_pad_top =
static_cast<unsigned int>(ii < 0 ? -ii : 0);
465 const auto input_i =
static_cast<unsigned int>(ii < 0 ? 0 : ii);
467 const int ij =
static_cast<int>(output_j *
args.stride_cols) -
args.padding.left;
468 const auto input_pad_left =
static_cast<unsigned int>(ij < 0 ? -ij : 0);
469 const auto input_j =
static_cast<unsigned int>(ij < 0 ? 0 : ij);
471 Tile<TInput> multiplied_input;
472 this->initialise_inptr_array(
args, output_channel_start, output_channel_end,
input,
473 ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
474 input_i, input_j, input_pad_top, input_pad_left, multiplied_input);
478 ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
479 output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
480 output.ld_row, output.ld_col,
482 0,
args.output_rows - output_i,
483 0,
args.output_cols - output_j
488 reinterpret_cast<const StratType *
>(this->m_strat.get()),
489 ws, this->get_output_stage(),
parameters, m_bias, output_channel_end - output_channel_start
493 void compute_row_padded_tile_row(
494 const DepthwiseArgs &
args,
495 const unsigned int output_i,
unsigned int output_j,
unsigned int n_tile_cols,
496 const unsigned int output_channel_start,
const unsigned int output_channel_end,
497 const TensorSpec<const TInput *> &
input,
498 const TensorSpec<TOutput *> &output,
503 using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
504 auto ws =
reinterpret_cast<WorkingSpace *
>(working_space);
505 const auto strat =
reinterpret_cast<const StratType *
>(this->m_strat.get());
506 const auto os = this->get_output_stage();
509 const int ii =
static_cast<int>(output_i *
args.stride_rows) -
args.padding.top;
510 const auto input_pad_top =
static_cast<unsigned int>(ii < 0 ? -ii : 0);
512 const auto input_i =
static_cast<unsigned int>(ii < 0 ? 0 : ii);
513 auto input_j = output_j *
args.stride_cols -
args.padding.left;
517 const auto valid_input_rows = std::min(strat->
get_input_rows() - input_pad_top,
args.input_rows - input_i);
518 const auto valid_output_rows = std::min(strat->
get_output_rows(),
args.output_rows - output_i);
520 const auto input_point_stride =
input.ld_col * this->m_strat->get_output_cols() *
args.stride_cols;
521 const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
523 Tile<TInput> multiplied_input;
524 this->initialise_inptr_array(
args, output_channel_start, output_channel_end,
input,
525 ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
526 input_i, input_j, input_pad_top, 0, multiplied_input);
529 ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
530 output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
531 output.ld_row, output.ld_col,
533 0,
args.output_rows - output_i,
534 0,
args.output_cols - output_j
537 for (; n_tile_cols; n_tile_cols--)
541 strat, ws, os,
parameters, m_bias, output_channel_end - output_channel_start
545 if (this->uses_intermediate_array()) {
546 input_j += input_point_stride /
input.ld_col;
547 multiplied_input.load_from(
input.base,
550 input_i, input_j,
args.channel_multiplier);
553 auto ptr = ws->inptr_array + strat->
get_input_cols() * input_pad_top;
554 for (
auto n = input_pad_top; n < (valid_input_rows + input_pad_top); n++)
558 *(ptr++) += input_point_stride;
565 auto ptr = ws->outptr_array;
566 for (
auto n = 0u; n < valid_output_rows * strat->
get_output_cols(); n++)
568 *(ptr++) += output_point_stride;
574 void compute_tiles_unpadded(
575 const DepthwiseArgs &
args,
576 unsigned int output_i,
const unsigned int output_j,
577 unsigned int n_tile_rows,
unsigned int n_tile_cols,
578 unsigned int output_channel_start,
unsigned int output_channel_end,
579 const TensorSpec<const TInput *> &
input,
580 const TensorSpec<TOutput *> &output,
582 void *working_space_raw
585 using Invoker = depthwise_depthfirst::Invoke<TInput, TWeight, TOutput, TAccum, OutputStage>;
586 auto ws =
reinterpret_cast<WorkingSpace *
>(working_space_raw);
587 const auto strat =
reinterpret_cast<const StratType *
>(this->m_strat.get());
588 const auto os = this->get_output_stage();
590 if (Invoker::supports_direct_kernel)
592 PaddingValues tile_padding = {
593 args.kernel_cols / 2,
594 args.kernel_rows / 2,
595 args.kernel_cols / 2,
601 auto outptr = output.base + output_channel_start + output_i * output.ld_row + output_j * output.ld_col;
602 const int start_input_i = output_i *
args.stride_rows -
args.padding.top;
603 const int start_input_j = output_j *
args.stride_cols -
args.padding.left;
604 auto inptr =
input.base + output_channel_start + start_input_i *
input.ld_row + start_input_j *
input.ld_col;
606 auto ld_row =
input.ld_row;
607 auto ld_col =
input.ld_col;
609 const auto tile_rows = this->m_strat->get_output_rows() *
args.stride_rows * n_tile_rows + tile_padding.top + tile_padding.bottom;
610 const auto tile_cols = this->m_strat->get_output_cols() *
args.stride_cols * n_tile_cols + tile_padding.left + tile_padding.right;
611 const auto tile_channels = output_channel_end - output_channel_start;
613 Tile<TInput> multiplied_input;
614 if (this->uses_intermediate_array()) {
615 multiplied_input = Tile<TInput>(ws->intermediate_buffer, tile_rows, tile_cols, tile_channels);
616 multiplied_input.load_from(
input.base,
619 start_input_i, start_input_j,
args.channel_multiplier);
621 ld_row = tile_cols * tile_channels;
622 ld_col = tile_channels;
623 inptr = multiplied_input.array;
629 n_tile_rows, n_tile_cols,
630 inptr, ld_row, ld_col,
631 outptr, output.ld_row, output.ld_col,
632 parameters, output_channel_end - output_channel_start
640 const auto n_input_pointers = this->m_strat->get_input_rows() * this->m_strat->get_input_cols();
641 const auto input_point_stride =
input.ld_col * this->m_strat->get_output_cols() *
args.stride_cols;
642 const auto n_output_pointers = this->m_strat->get_output_rows() * this->m_strat->get_output_cols();
643 const auto output_point_stride = output.ld_col * this->m_strat->get_output_cols();
647 for (
unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
649 const int input_i =
static_cast<int>(output_i *
args.stride_rows) -
args.padding.top;
650 int input_j =
static_cast<int>(output_j *
args.stride_cols) -
args.padding.left;
652 Tile<TInput> multiplied_input;
653 this->initialise_inptr_array(
args, output_channel_start, output_channel_end,
input,
654 ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
655 input_i, input_j, 0, 0, multiplied_input);
659 ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
660 output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
661 output.ld_row, output.ld_col,
667 for (
unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
671 strat, ws, os,
parameters, m_bias, output_channel_end - output_channel_start
675 if (this->uses_intermediate_array()) {
676 input_j += input_point_stride /
input.ld_col;
677 multiplied_input.load_from(
input.base,
679 args.input_rows,
args.input_cols, input_i, input_j,
args.channel_multiplier);
681 for (
auto i = 0u; i < n_input_pointers; i++)
683 ws->inptr_array[i] += input_point_stride;
687 for (
auto i = 0u; i < n_output_pointers; i++)
689 ws->outptr_array[i] += output_point_stride;
693 output_i += this->m_strat->get_output_rows();