30 #ifdef CYCLE_PROFILING 31 #include "profiler.hpp" 39 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum>
48 args.kernel_rows, args.kernel_cols,
sizeof(TWeight),
52 [args] (
unsigned int pos,
unsigned int &x,
unsigned int &y) ->
bool 54 if (pos < args.kernel_rows * args.kernel_cols)
56 y = pos % args.kernel_cols;
57 x = pos / args.kernel_cols;
73 void pack_parameters(
const DepthwiseArgs &args,
void *buffer,
const void *biases,
const Nothing &,
const void *weights,
size_t ld_weight_col,
size_t ld_weight_row)
const override 76 this->get_packing_args(args), args,
77 buffer, biases, weights, ld_weight_col, ld_weight_row
82 const TInput *
const *,
92 template <
typename TInput,
typename TWeight,
typename TOutput>
107 interleaves::quantized::pack_parameters<TWeight>(
108 buffer,
reinterpret_cast<const int32_t *
>(biases),
109 reinterpret_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row,
115 const TInput *
const *,
125 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum>
129 const unsigned int m_output_rows, m_output_cols;
133 : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
144 const TInput *
const *,
148 unsigned int,
unsigned int,
154 template <
typename TInput,
typename TWeight,
typename TOutput>
158 const unsigned int m_output_rows, m_output_cols;
162 : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
173 const TInput *
const *,
177 unsigned int,
unsigned int,
178 const int32_t *,
const int32_t *,
const int32_t *,
184 template <
typename TInput,
185 typename TWeight=TInput,
186 typename TOutput=TInput,
192 std::unique_ptr<KernelStrategyType> m_kern;
198 args.kernel_rows, args.kernel_cols,
sizeof(TWeight),
199 false,
sizeof(TAccum),
202 [args] (
unsigned int pos,
unsigned int &x,
unsigned int &y) ->
bool 204 if (pos < args.kernel_rows * args.kernel_cols)
206 y = pos % args.kernel_cols;
207 x = pos / args.kernel_cols;
219 args.kernel_rows, args.kernel_cols,
220 args.stride_rows, args.stride_cols
234 void pack_parameters(
const DepthwiseArgs &args,
void *buffer,
const void *biases,
const OutputStage &,
const void *weights,
size_t ld_weight_col,
size_t ld_weight_row)
const override 237 this->get_packing_args(args), args,
238 buffer, biases, weights, ld_weight_col, ld_weight_row
244 namespace depthfirst_multiplier {
250 template <
typename T,
bool IsGeneric=false,
typename OutputStage=Nothing>
256 constexpr
static bool InputPatchIsGeneric = IsGeneric;
264 return sizeof_input_rows(args) + sizeof_input_padding(args) + sizeof_input_patch(args);
267 template <
class WorkspaceType>
268 static void *
initialise(WorkspaceType *ws,
void *buffer,
const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &
args)
270 auto buffer_bytes =
reinterpret_cast<char *
>(buffer);
272 ws->input_rows =
reinterpret_cast<const T **
>(buffer_bytes);
273 buffer_bytes += sizeof_input_rows(args);
275 ws->input_padding =
reinterpret_cast<T*
>(buffer_bytes);
276 buffer_bytes += sizeof_input_padding(args);
278 ws->input_patch =
reinterpret_cast<T*
>(buffer_bytes);
279 buffer_bytes += sizeof_input_patch(args);
282 memset(ws->input_padding,
283 get_input_buffer_fill_value(args.output_stage),
284 sizeof_input_padding(args));
290 static size_t sizeof_input_rows(
const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &
args)
294 return sizeof(T *) * args.strategy->get_output_rows() * args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
298 return sizeof(T *) * args.strategy->get_input_rows();
302 static size_t sizeof_input_padding(
const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
305 auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 /
sizeof(T));
306 return sizeof(T) * input_cols;
309 static size_t sizeof_input_patch(
const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
314 auto output_cols = arm_gemm::roundup<size_t>(args.strategy->get_output_cols(), 16 /
sizeof(T));
315 const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
316 return sizeof(T) * kernel_points * args.strategy->get_output_rows() * output_cols;
321 auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 /
sizeof(T));
322 return sizeof(T) * args.strategy->get_input_rows() * input_cols;
327 template <
bool IsGeneric,
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum,
typename OutputStage>
332 template <
typename WorkspaceType>
334 const DepthwiseArgs &
args,
const WorkspaceType *ws,
const Type *strat,
335 const OutputStage &,
const unsigned int,
343 ws->activation_min, ws->activation_max
348 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum,
typename OutputStage>
349 struct StrategyType<true, TInput, TWeight, TOutput, TAccum, OutputStage>
353 template <
typename WorkspaceType>
355 const DepthwiseArgs &
args,
const WorkspaceType *ws,
const Type *strat,
356 const OutputStage &,
const unsigned int start_output_channel,
361 ws->input_rows, ws->outptr_array,
362 reinterpret_cast<const TWeight *
>(
parameters),
363 bias ==
nullptr ?
nullptr : reinterpret_cast<const TAccum *>(bias) + start_output_channel,
365 args.channel_multiplier,
366 ws->activation_min, ws->activation_max
371 template <
typename TInput,
typename TWeight,
typename TOutput>
376 template <
typename WorkspaceType>
378 const DepthwiseArgs &
args,
const WorkspaceType *ws,
const Type *strat,
392 template <
typename TInput,
typename TWeight,
typename TOutput>
397 template <
typename WorkspaceType>
399 const DepthwiseArgs &
args,
const WorkspaceType *ws,
const Type *strat,
404 auto get_ptr = [start_output_channel] (
const int32_t *ptr) ->
const int32_t *
406 return ptr ==
nullptr ? nullptr : ptr + start_output_channel;
410 ws->input_rows, ws->outptr_array,
411 reinterpret_cast<const TWeight *
>(
parameters),
414 args.channel_multiplier,
427 template <
typename WorkspaceType,
typename StrategyType,
typename T>
429 const DepthwiseArgs &, WorkspaceType *ws,
const StrategyType *strat,
430 T *base_ptr,
size_t ld_row,
size_t ld_col,
431 const unsigned int input_pad_top,
const unsigned int valid_rows,
432 const unsigned int input_pad_left,
const unsigned int valid_cols
436 ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(),
437 base_ptr, ld_row, ld_col,
439 input_pad_top, valid_rows,
440 input_pad_left, valid_cols
447 template <
typename WorkspaceType,
typename StrategyType,
typename T>
450 T *base_ptr,
size_t ld_row,
size_t ld_col,
451 const unsigned int input_pad_top,
const unsigned int valid_rows,
452 const unsigned int input_pad_left,
const unsigned int valid_cols
456 ws->input_rows, ws->input_patch,
457 strat->get_output_rows(), strat->get_output_cols(),
458 args.kernel_rows, args.kernel_cols,
459 args.stride_rows, args.stride_cols,
460 base_ptr, ld_row, ld_col,
462 input_pad_top, valid_rows,
463 input_pad_left, valid_cols
470 template <
typename TInput,
471 typename TWeight=TInput,
472 typename TOutput=TInput,
474 bool is_generic=
false,
480 using WorkspaceManager = Workspace<
481 OutputArrayElement<TOutput>,
483 ActivationsElement<TOutput, OutputStage>
485 using WorkingSpace =
typename WorkspaceManager::WorkspaceType;
488 const void *m_bias =
nullptr;
501 return reinterpret_cast<const StratType *
>(this->m_strat.get())
505 void pack_parameters(
void *buffer,
const void *biases,
const void *weights,
size_t ld_weight_col,
size_t ld_weight_row)
override 507 reinterpret_cast<const StratType *
>(this->m_strat.get())
508 ->
pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
510 depthwise_depthfirst::stash_bias(m_os, biases);
515 DepthwiseArgs
args(this->m_args);
516 args.input_channels = n_input_channels;
517 return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(),
args, m_os));
522 DepthwiseArgs
args(this->m_args);
523 args.input_channels = n_input_channels;
524 return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(),
args, m_os));
528 const DepthwiseArgs &args,
529 unsigned int output_i,
unsigned int output_j,
530 unsigned int output_channel_start,
unsigned int output_channel_end,
534 void *working_space_raw
538 auto ws =
reinterpret_cast<WorkingSpace *
>(working_space_raw);
540 const int ii =
static_cast<int>(output_i * args.stride_rows) - args.padding.top;
541 const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
542 const auto input_i =
static_cast<unsigned int>(ii < 0 ? 0 : ii);
544 const int ij =
static_cast<int>(output_j * args.stride_cols) - args.padding.left;
545 const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
546 const auto input_j =
static_cast<unsigned int>(ij < 0 ? 0 : ij);
551 ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
552 output.
base + output_i*output.
ld_row + output_j*output.
ld_col + output_channel_start,
555 0, args.output_rows - output_i,
556 0, args.output_cols - output_j
560 DepthwiseArgs single_iter(args);
561 single_iter.input_channels = 1;
562 const size_t parameter_stride =
reinterpret_cast<const StratType *
>(this->m_strat.get())
565 for (; output_channel_start < output_channel_end;
566 output_channel_start += args.channel_multiplier)
569 const auto input_channel = output_channel_start / args.channel_multiplier;
573 args, ws, this->m_strat.get(),
575 input_pad_top, args.input_rows - input_i,
576 input_pad_left, args.input_cols - input_j
581 args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
586 for (
unsigned int n = 0;
n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols();
n++)
588 ws->outptr_array[
n] += args.channel_multiplier;
592 parameters =
reinterpret_cast<const char *
>(
parameters) + parameter_stride;
arm_gemm::VLType get_vl_type(void) const
const int32_t * per_channel_left_shifts
virtual unsigned int get_accumulator_depth_vl() const
void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
void fill_nchw_patch_array(size_t element_size, const void **dest_row_pointers_raw, void *dest_patch_raw, const unsigned int patch_rows, unsigned int patch_cols, const void *src_ptr_raw, size_t ld_row, size_t ld_col, const void *pad_row, const unsigned int pad_top, const unsigned int valid_rows, const unsigned int pad_left, const unsigned int valid_cols)
size_t get_storage_size(void) const override
unsigned int get_kernel_cols() const override
size_t get_storage_size_generic(const PackingArguments &packing_args, const DepthwiseArgs &args)
arm_gemm::VLType get_vl_type(void) const override
void fill_patch_array_generic_kernel(size_t element_size, const void **dest_pointers_raw, void *patch_raw, const unsigned int output_rows, const unsigned int output_cols, const unsigned int kernel_rows, const unsigned int kernel_cols, const unsigned int stride_rows, const unsigned int stride_cols, const void *src_ptr_raw, size_t ld_row, size_t ld_col, const void *pad_row, const unsigned int pad_top, const unsigned int valid_rows, const unsigned int pad_left, const unsigned int valid_cols)
static void execute(const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat, const OutputStage &, const unsigned int, const void *parameters, const void *)
void initialise_working_space(void *buffer, unsigned int n_input_channels) const override
std::function< void(const TInput *const *, TOutput *const *, const TWeight *, const TAccum *, unsigned int, unsigned int, TAccum, TAccum)> KernelType
GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
std::unique_ptr< ParametersLibrary > parameters
unsigned int get_output_cols(void) const
virtual arm_gemm::VLType get_vl_type() const =0
size_t get_storage_size(const DepthwiseArgs &args, const arm_gemm::VLType vl_type, const unsigned int accumulator_depth_vl)
unsigned int get_output_rows(void) const
size_t get_working_size_per_thread(const unsigned int n_input_channels) const override
const KernelStrategyType::KernelType get_kernel(void) const
unsigned int get_output_rows() const override
const int32_t * per_channel_right_shifts
void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
void fill_pointer_array(size_t element_size, void **dest_raw, const unsigned int array_rows, const unsigned int array_cols, void *base_ptr_raw, size_t ld_row, size_t ld_col, void *pad_buffer_raw, const unsigned int pad_top, const unsigned int valid_rows, const unsigned int pad_left, const unsigned int valid_cols)
template UniqueDepthwiseCommon< float > depthwise(const DepthwiseArgs &, const Nothing &)
void compute_tile_padded(const DepthwiseArgs &args, unsigned int output_i, unsigned int output_j, unsigned int output_channel_start, unsigned int output_channel_end, const TensorSpec< const TInput *> &input, const TensorSpec< TOutput *> &output, const void *parameters, void *working_space_raw) const override
virtual KernelType get_kernel(void) const =0
GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args)
DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os={})
std::function< void(const TInput *const *, TOutput *const *, const void *, unsigned int, TAccum, TAccum)> KernelType
unsigned int get_output_cols() const override
const int32_t * per_channel_muls
unsigned int get_kernel_rows() const override
size_t get_storage_size(const DepthwiseArgs &args) const override
void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
void pack_parameters_generic(const PackingArguments &packing_args, const DepthwiseArgs &args, void *buffer_raw, const void *biases_raw, const void *weights_raw, size_t ld_weight_col, size_t ld_weight_row)
size_t get_storage_size(const DepthwiseArgs &args) const override