35 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum>
44 args.kernel_rows,
args.kernel_cols,
sizeof(TWeight),
45 true,
sizeof(TAccum), this->uses_premultiply(),
// Callback that converts a linear kernel-point index `pos` into a pair of
// coordinates. `args` is captured by copy. For any `pos` below
// kernel_rows * kernel_cols, `y` receives the remainder and `x` the quotient
// of `pos` divided by kernel_cols.
// NOTE(review): the lambda's braces and return statements are not visible in
// this excerpt; presumably it returns whether `pos` was in range - confirm.
48 [
args] (
unsigned int pos,
unsigned int &x,
unsigned int &y) ->
bool
// Only positions inside the kernel window are decomposed.
50 if (pos < args.kernel_rows * args.kernel_cols)
52 y = pos % args.kernel_cols;
53 x = pos / args.kernel_cols;
61 bool uses_premultiply()
const override {
73 void pack_parameters(
const DepthwiseArgs &
args,
void *buffer,
const void *biases,
const Nothing &,
const void *weights,
size_t ld_weight_col,
size_t ld_weight_row)
const override
77 buffer, biases, weights, ld_weight_col, ld_weight_row
82 const TInput *
const *,
92 template <
typename TInput,
typename TWeight,
typename TOutput>
107 interleaves::quantized::pack_parameters<TWeight>(
108 buffer,
reinterpret_cast<const int32_t *
>(biases),
109 reinterpret_cast<const TWeight *
>(weights), ld_weight_col, ld_weight_row,
110 args, qp, this->get_vl_type(), this->get_accumulator_depth_vl()
115 const TInput *
const *,
121 virtual KernelType get_kernel(
void)
const = 0;
125 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum>
129 const unsigned int m_output_rows, m_output_cols;
133 : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
144 const TInput *
const *,
148 unsigned int,
unsigned int,
151 virtual KernelType get_kernel(
void)
const = 0;
154 template <
typename TInput,
typename TWeight,
typename TOutput>
158 const unsigned int m_output_rows, m_output_cols;
162 : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
173 const TInput *
const *,
177 unsigned int,
unsigned int,
178 const int32_t *,
const int32_t *,
const int32_t *,
181 virtual KernelType get_kernel(
void)
const = 0;
184 template <
typename TInput,
185 typename TWeight=TInput,
186 typename TOutput=TInput,
192 std::unique_ptr<KernelStrategyType> m_kern;
198 args.kernel_rows,
args.kernel_cols,
sizeof(TWeight),
199 false,
sizeof(TAccum), this->uses_premultiply(),
// Callback that converts a linear kernel-point index `pos` into a pair of
// coordinates. `args` is captured by copy. For any `pos` below
// kernel_rows * kernel_cols, `y` receives the remainder and `x` the quotient
// of `pos` divided by kernel_cols.
// NOTE(review): the lambda's braces and return statements are not visible in
// this excerpt; presumably it returns whether `pos` was in range - confirm.
202 [
args] (
unsigned int pos,
unsigned int &x,
unsigned int &y) ->
bool
// Only positions inside the kernel window are decomposed.
204 if (pos < args.kernel_rows * args.kernel_cols)
206 y = pos % args.kernel_cols;
207 x = pos / args.kernel_cols;
215 bool uses_premultiply()
const override {
222 kern->get_output_rows(), kern->get_output_cols(),
238 void pack_parameters(
const DepthwiseArgs &
args,
void *buffer,
const void *biases,
const OutputStage &,
const void *weights,
size_t ld_weight_col,
size_t ld_weight_row)
const override
242 buffer, biases, weights, ld_weight_col, ld_weight_row
248 namespace depthfirst_multiplier {
254 template <
typename T,
bool IsGeneric=false,
typename OutputStage=Nothing>
260 constexpr
static bool InputPatchIsGeneric = IsGeneric;
268 return sizeof_input_rows(
args) + sizeof_input_padding(
args) + sizeof_input_patch(
args);
// Carves three consecutive regions out of `buffer` and stores their start
// addresses in the workspace `ws`, in this order:
//   1. ws->input_rows    (sizeof_input_rows(args) bytes)
//   2. ws->input_padding (sizeof_input_padding(args) bytes)
//   3. ws->input_patch   (sizeof_input_patch(args) bytes)
// The padding region is then filled with the value returned by
// get_input_buffer_fill_value(args.output_stage).
//
// @param ws      Workspace whose pointer members are set.
// @param buffer  Raw storage to partition.
// @param args    Workspace arguments (strategy, depthwise args, output stage).
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the pointer just past the consumed bytes is returned - confirm.
271 template <
class WorkspaceType>
272 static void *
initialise(WorkspaceType *ws,
void *buffer,
const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &
args)
// Byte-typed cursor so the regions can be stepped over by their byte sizes.
274 auto buffer_bytes =
reinterpret_cast<char *
>(buffer);
276 ws->input_rows =
reinterpret_cast<const T **
>(buffer_bytes);
277 buffer_bytes += sizeof_input_rows(
args);
279 ws->input_padding =
reinterpret_cast<T*
>(buffer_bytes);
280 buffer_bytes += sizeof_input_padding(
args);
282 ws->input_patch =
reinterpret_cast<T*
>(buffer_bytes);
283 buffer_bytes += sizeof_input_patch(
args);
// Pre-fill the padding region; memset writes the fill value byte-wise.
286 memset(ws->input_padding,
287 get_input_buffer_fill_value(
args.output_stage),
288 sizeof_input_padding(
args));
// Size in bytes of the array of input pointers held by the workspace.
// Two sizes are computed:
//   - one pointer per (strategy output row x kernel row x kernel column), or
//   - one pointer per strategy input row.
// NOTE(review): the condition choosing between the two returns is not visible
// in this excerpt; presumably it is the IsGeneric template flag - confirm.
294 static size_t sizeof_input_rows(
const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &
args)
298 return sizeof(T *) *
args.strategy->get_output_rows() *
args.depthwise_args.kernel_rows *
args.depthwise_args.kernel_cols;
302 return sizeof(T *) *
args.strategy->get_input_rows();
// Size in bytes of the padding buffer: the strategy's input column count,
// passed through arm_gemm::roundup with a granule of 16 / sizeof(T) elements,
// multiplied by sizeof(T).
// NOTE(review): presumably roundup rounds up to a multiple of its second
// argument, making the buffer a whole number of 16-byte units - confirm.
306 static size_t sizeof_input_padding(
const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &
args)
309 auto input_cols = arm_gemm::roundup<size_t>(
args.strategy->get_input_cols(), 16 /
sizeof(T));
310 return sizeof(T) * input_cols;
// Size in bytes of the input patch buffer. Two sizes are computed:
//   - sizeof(T) * (kernel_rows * kernel_cols) * output_rows * rounded output_cols
//   - sizeof(T) * input_rows * rounded input_cols
// In both, the column count goes through arm_gemm::roundup with a granule of
// 16 / sizeof(T) elements.
// NOTE(review): the condition choosing between the two computations is not
// visible in this excerpt; presumably it is the IsGeneric flag - confirm.
313 static size_t sizeof_input_patch(
const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &
args)
318 auto output_cols = arm_gemm::roundup<size_t>(
args.strategy->get_output_cols(), 16 /
sizeof(T));
// Number of points in the kernel window.
319 const auto kernel_points =
args.depthwise_args.kernel_rows *
args.depthwise_args.kernel_cols;
320 return sizeof(T) * kernel_points *
args.strategy->get_output_rows() * output_cols;
325 auto input_cols = arm_gemm::roundup<size_t>(
args.strategy->get_input_cols(), 16 /
sizeof(T));
326 return sizeof(T) *
args.strategy->get_input_rows() * input_cols;
331 template <
bool IsGeneric,
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum,
typename OutputStage>
336 template <
typename WorkspaceType>
338 const DepthwiseArgs &
args,
const WorkspaceType *ws,
const Type *strat,
339 const OutputStage &,
const unsigned int,
347 ws->activation_min, ws->activation_max
352 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum,
typename OutputStage>
353 struct StrategyType<true, TInput, TWeight, TOutput, TAccum, OutputStage>
357 template <
typename WorkspaceType>
359 const DepthwiseArgs &
args,
const WorkspaceType *ws,
const Type *strat,
360 const OutputStage &,
const unsigned int start_output_channel,
365 ws->input_rows, ws->outptr_array,
366 reinterpret_cast<const TWeight *
>(
parameters),
367 bias ==
nullptr ?
nullptr :
reinterpret_cast<const TAccum *
>(
bias) + start_output_channel,
369 args.channel_multiplier,
370 ws->activation_min, ws->activation_max
375 template <
typename TInput,
typename TWeight,
typename TOutput>
380 template <
typename WorkspaceType>
382 const DepthwiseArgs &
args,
const WorkspaceType *ws,
const Type *strat,
396 template <
typename TInput,
typename TWeight,
typename TOutput>
401 template <
typename WorkspaceType>
403 const DepthwiseArgs &
args,
const WorkspaceType *ws,
const Type *strat,
// Helper that advances an int32_t pointer by `start_output_channel` elements.
// A null pointer is passed through unchanged, so it is never offset.
408 auto get_ptr = [start_output_channel] (
const int32_t *ptr) ->
const int32_t *
410 return ptr ==
nullptr ? nullptr : ptr + start_output_channel;
414 ws->input_rows, ws->outptr_array,
415 reinterpret_cast<const TWeight *
>(
parameters),
418 args.channel_multiplier,
431 template <
typename WorkspaceType,
typename StrategyType,
typename T>
433 const DepthwiseArgs &, WorkspaceType *ws,
const StrategyType *strat,
434 T *base_ptr,
size_t ld_row,
size_t ld_col,
435 const unsigned int input_pad_top,
const unsigned int valid_rows,
436 const unsigned int input_pad_left,
const unsigned int valid_cols
440 ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(),
441 base_ptr, ld_row, ld_col,
443 input_pad_top, valid_rows,
444 input_pad_left, valid_cols
451 template <
typename WorkspaceType,
typename StrategyType,
typename T>
454 T *base_ptr,
size_t ld_row,
size_t ld_col,
455 const unsigned int input_pad_top,
const unsigned int valid_rows,
456 const unsigned int input_pad_left,
const unsigned int valid_cols
460 ws->input_rows, ws->input_patch,
461 strat->get_output_rows(), strat->get_output_cols(),
464 base_ptr, ld_row, ld_col,
466 input_pad_top, valid_rows,
467 input_pad_left, valid_cols
474 template <
typename TInput,
475 typename TWeight=TInput,
476 typename TOutput=TInput,
478 bool is_generic=
false,
479 typename OutputStage=
typename DefaultOutputStage<TOutput>::Type>
484 using WorkspaceManager = Workspace<
485 OutputArrayElement<TOutput>,
487 ActivationsElement<TOutput, OutputStage>
489 using WorkingSpace =
typename WorkspaceManager::WorkspaceType;
492 const void *m_bias =
nullptr;
494 bool uses_premultiply()
const override {
504 DepthwiseDepthfirstMultiplier(DepthwiseDepthfirstMultiplier &) =
delete;
505 DepthwiseDepthfirstMultiplier &operator=(DepthwiseDepthfirstMultiplier &) =
delete;
509 return reinterpret_cast<const StratType *
>(this->m_strat.get())
// Packs parameters into `buffer` by forwarding to the strategy object's
// pack_parameters, adding this object's m_args and m_os to the caller's
// arguments. Afterwards depthwise_depthfirst::stash_bias(m_os, biases) is called.
//
// @param buffer         Destination passed through to the strategy.
// @param biases         Bias pointer; also handed to stash_bias.
// @param weights        Weights pointer passed through to the strategy.
// @param ld_weight_col  Passed through unchanged.
// @param ld_weight_row  Passed through unchanged.
// NOTE(review): some lines of this function are not visible in this excerpt.
513 void pack_parameters(
void *buffer,
const void *biases,
const void *weights,
size_t ld_weight_col,
size_t ld_weight_row)
override
// m_strat is viewed as the concrete StratType to reach its pack_parameters.
515 reinterpret_cast<const StratType *
>(this->m_strat.get())
516 ->
pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
518 depthwise_depthfirst::stash_bias(m_os, biases);
523 DepthwiseArgs
args(this->m_args);
524 return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(),
args, m_os));
529 DepthwiseArgs
args(this->m_args);
530 return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(),
args, m_os));
534 const DepthwiseArgs &
args,
535 unsigned int output_i,
unsigned int output_j,
536 unsigned int output_channel_start,
unsigned int output_channel_end,
540 void *working_space_raw
// View the untyped working-space pointer as the typed workspace structure.
544 auto ws =
reinterpret_cast<WorkingSpace *
>(working_space_raw);
// Signed input row of the tile origin: output_i * stride_rows - padding.top.
// A negative value means the tile starts inside the top padding.
546 const int ii =
static_cast<int>(output_i *
args.stride_rows) -
args.padding.top;
// Amount by which the tile starts above input row 0 (0 when ii >= 0).
547 const auto input_pad_top =
static_cast<unsigned int>(ii < 0 ? -ii : 0);
// Starting input row, clamped so it is never negative.
548 const auto input_i =
static_cast<unsigned int>(ii < 0 ? 0 : ii);
// Same computation for columns: output_j * stride_cols - padding.left.
550 const int ij =
static_cast<int>(output_j *
args.stride_cols) -
args.padding.left;
// Amount by which the tile starts left of input column 0 (0 when ij >= 0).
551 const auto input_pad_left =
static_cast<unsigned int>(ij < 0 ? -ij : 0);
// Starting input column, clamped so it is never negative.
552 const auto input_j =
static_cast<unsigned int>(ij < 0 ? 0 : ij);
557 ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
558 output.
base + output_i*output.
ld_row + output_j*output.
ld_col + output_channel_start,
561 0,
args.output_rows - output_i,
562 0,
args.output_cols - output_j
// Copy of the depthwise arguments with input_channels forced to 1.
// NOTE(review): where single_iter is consumed is not visible in this excerpt.
566 DepthwiseArgs single_iter(
args);
567 single_iter.input_channels = 1;
// NOTE(review): the remainder of this initialiser is not visible here.
568 const size_t parameter_stride =
reinterpret_cast<const StratType *
>(this->m_strat.get())
// Step through the output-channel range, advancing by channel_multiplier
// each iteration.
571 for (; output_channel_start < output_channel_end;
572 output_channel_start +=
args.channel_multiplier)
// Input channel for this iteration: integer quotient of the current output
// channel by channel_multiplier.
575 const auto input_channel = output_channel_start /
args.channel_multiplier;
// The next lines are argument fragments of calls whose callee names are not
// visible in this excerpt. The visible arguments pass the padding amounts and
// the input extent remaining from (input_i, input_j).
579 args, ws, this->m_strat.get(),
581 input_pad_top,
args.input_rows - input_i,
582 input_pad_left,
args.input_cols - input_j
587 args, ws,
reinterpret_cast<const StratType *
>(this->m_strat.get()), m_os, output_channel_start,
// Advance each of the output_rows * output_cols output pointers by
// channel_multiplier elements.
592 for (
unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++)
594 ws->outptr_array[n] +=
args.channel_multiplier;