// Fragment of a declaration templated on OutputStage. The lines between the
// template header and the parameter list below are not visible in this
// excerpt, so the declared name cannot be read from here.
// NOTE(review): the seven parameters line up with the seven-argument
// m_strat->pack_parameters(...) call made further down this file, so this is
// presumably that method's declaration - confirm against the full header.
33 template <
typename OutputStage>
43 const DepthwiseArgs &
args,
void *buffer,
44 const void *biases,
// The OutputStage argument is unnamed in this declaration.
const OutputStage &,
45 const void *weights,
size_t ld_weight_col,
size_t ld_weight_row
// Start of a template header naming the type parameters TInput, TWeight,
// TOutput and TAccum. The list ends on a trailing comma, so the remaining
// parameter(s) and the entity being declared are not visible in this excerpt.
50 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum,
// Variant templated on TInput, TWeight, TOutput and TAccum.
// The first parameter list below uses unnamed pointer parameters and ends with
// a pair of TAccum bounds (act_min, act_max).
// NOTE(review): presumably this is the kernel function signature for the case
// without an output stage, since the wrapper further down takes an unnamed
// const Nothing & - confirm against the full header.
54 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum>
58 const TInput *,
size_t ld_in_row,
size_t ld_in_col,
size_t ld_in_vl,
59 unsigned int pad_top,
unsigned int valid_input_rows,
60 unsigned int pad_left,
unsigned int valid_input_cols,
61 const TWeight *,
const TAccum *,
62 TOutput **,
const size_t *,
const size_t *,
unsigned int output_cols,
63 unsigned int start_channels,
unsigned int valid_channels,
64 TAccum act_min, TAccum act_max
// Wrapper fragment templated on WorkspaceType. Its parameters mirror the list
// above with named pointers, followed by an unnamed const Nothing & and a
// pointer to the workspace (ws). The function name is on a line not visible
// in this excerpt.
67 template <
typename WorkspaceType>
70 const TInput *inptr,
size_t ld_in_row,
size_t ld_in_col,
size_t ld_in_vl,
71 unsigned int pad_top,
unsigned int valid_input_rows,
72 unsigned int pad_left,
unsigned int valid_input_cols,
73 const TWeight *weights,
const TAccum *
bias,
74 TOutput **outptrs,
const size_t *outlds,
const size_t *outvllds,
unsigned int output_cols,
75 unsigned int start_channel,
unsigned int valid_channels,
76 const Nothing &,
const WorkspaceType *ws
// Forwarded argument list: the wrapper parameters are passed through in
// order, and the activation bounds are read from the workspace
// (ws->activation_min, ws->activation_max). The callee expression and the
// arguments between valid_input_cols and outptrs are on lines not visible in
// this excerpt.
80 inptr, ld_in_row, ld_in_col, ld_in_vl,
81 pad_top, valid_input_rows,
82 pad_left, valid_input_cols,
84 outptrs, outlds, outvllds, output_cols,
85 start_channel, valid_channels,
86 ws->activation_min, ws->activation_max
// Variant templated on only three types (TInput, TWeight, TOutput); there is
// no TAccum parameter here.
// The parameter list below is incomplete in this excerpt: the entries between
// valid_input_cols and the output pointer array, and anything following
// valid_channels, are on lines that are not shown.
91 template <
typename TInput,
typename TWeight,
typename TOutput>
95 const TInput *,
size_t ld_in_row,
size_t ld_in_col,
size_t ld_in_vl,
96 unsigned int pad_top,
unsigned int valid_input_rows,
97 unsigned int pad_left,
unsigned int valid_input_cols,
99 TOutput **,
const size_t *,
const size_t *,
unsigned int output_cols,
100 unsigned int start_channel,
unsigned int valid_channels,
// Wrapper fragment templated on WorkspaceType. The parameter that follows
// weights is an unnamed const int32_t *, so it cannot be referenced inside
// this wrapper. Parameters after valid_channels are not visible here.
104 template <
typename WorkspaceType>
107 const TInput *inptr,
size_t ld_in_row,
size_t ld_in_col,
size_t ld_in_vl,
108 unsigned int pad_top,
unsigned int valid_input_rows,
109 unsigned int pad_left,
unsigned int valid_input_cols,
110 const TWeight *weights,
const int32_t *,
111 TOutput **outptrs,
const size_t *outlds,
const size_t *outldvls,
unsigned int output_cols,
112 unsigned int first_channel,
unsigned int valid_channels,
// Forwarded argument list; the callee expression, the argument(s) between
// valid_input_cols and outptrs, and any trailing arguments are on lines not
// visible in this excerpt.
117 inptr, ld_in_row, ld_in_col, ld_in_vl,
118 pad_top, valid_input_rows,
119 pad_left, valid_input_cols,
121 outptrs, outlds, outldvls, output_cols,
122 first_channel, valid_channels,
// Template header in which TWeight and TOutput both default to TInput. The
// remainder of the parameter list and the class head are not visible in this
// excerpt.
129 template <
typename TInput,
typename TWeight=TInput,
typename TOutput=TInput,
// Kernel window dimensions, strides and output row count. Each of these is
// initialised from the same-named constructor argument in the initialiser
// list further down.
134 unsigned int m_kernel_rows, m_kernel_cols;
135 unsigned int m_stride_rows, m_stride_cols;
136 unsigned int m_output_rows;
// Translates a linear kernel index into a pair of coordinates written to the
// out-parameters: y receives index % m_kernel_cols and x receives
// index / m_kernel_cols.
// The guard below compares index against m_kernel_rows * m_kernel_cols; the
// statement it controls and the return statements are on lines not visible
// in this excerpt.
// NOTE(review): presumably returns false for an out-of-range index and true
// otherwise - confirm against the full source.
140 virtual bool get_kernel_packing_point(
const unsigned int index,
unsigned int &x,
unsigned int &y)
const
144 if (m_kernel_rows * m_kernel_cols <= index)
147 y = index % m_kernel_cols;
148 x = index / m_kernel_cols;
// Argument list used to build a packing description; the function head and
// the name of the type being constructed are not visible in this excerpt.
// NOTE(review): presumably the body of get_kernel_packing_arguments(), which
// is called further down this file - confirm.
// The arguments carry the kernel dimensions, the byte sizes of TWeight and
// TAccum, the vector-length type and a callback.
// NOTE(review): the meaning of the two bool flags and of the trailing 1 is
// defined by the callee signature, which is not shown here - check it before
// changing any of them.
155 m_kernel_rows, m_kernel_cols,
sizeof(TWeight),
156 false,
sizeof(TAccum),
true,
157 m_vl_type,
sizeof(TAccum), 1,
// Callback capturing this; it forwards straight to the virtual
// get_kernel_packing_point so that overrides are honoured.
158 [
this] (
unsigned int idx,
unsigned int &x,
unsigned int &y) ->
bool
159 {
return this->get_kernel_packing_point(idx, x, y); }
// Constructor parameter list and member initialisers (the constructor name is
// on a line not visible in this excerpt). Every m_* member is initialised
// from the argument of the same name. The vl_type parameter that feeds
// m_vl_type is declared on a line that is not shown here.
165 unsigned int kernel_rows,
unsigned int kernel_cols,
166 unsigned int stride_rows,
unsigned int stride_cols,
167 unsigned int output_rows,
169 ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
170 m_stride_rows(stride_rows), m_stride_cols(stride_cols),
171 m_output_rows(output_rows), m_vl_type(vl_type)
// Parameter list followed by a forwarding call; the function name and the
// callee name are on lines not visible in this excerpt.
// NOTE(review): the parameter list matches the pack_parameters call made by
// the driver class further down, so this is presumably the strategy
// implementation of that method - confirm.
184 const DepthwiseArgs &
args,
void *buffer,
185 const void *biases,
// The OutputStage argument is unnamed and is not forwarded below.
const OutputStage &,
186 const void *weights,
size_t ld_weight_col,
size_t ld_weight_row
// Forwards the packing description from get_kernel_packing_arguments()
// together with the convolution arguments, destination buffer, biases,
// weights and both weight strides.
190 this->get_kernel_packing_arguments(),
args,
191 buffer, biases, weights, ld_weight_col, ld_weight_row
// Workspace element, parameterised on element type T, that provides
// per-output-row bookkeeping arrays plus a padding buffer. The struct members
// between the head and get_element_size are not visible in this excerpt.
202 template <
typename T>
203 struct OutputRowPtrsElement
// Returns the number of bytes this element needs: for every strategy output
// row one T * and two size_t entries, followed by
// get_vector_length<char>(vl type) further bytes.
213 template <
typename OutputStage>
214 static size_t get_element_size(
const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &
args)
218 return args.strategy->get_output_rows() * (
sizeof(T *) + 2*
sizeof(
size_t)) +
219 get_vector_length<char>(
args.strategy->get_vl_type());
// Carves buffer into four consecutive regions and records their addresses in
// the workspace: n_rows row pointers (output_row_ptrs), n_rows size_t entries
// (output_ld_cols), n_rows size_t entries (output_ld_vls) and finally the
// padding buffer (output_padding_buffer).
// Returns the address immediately after get_vector_length<T>(vl type)
// elements of the padding buffer.
// NOTE(review): this layout agrees with get_element_size only if
// get_vector_length<char> equals sizeof(T) * get_vector_length<T> - presumably
// true because both describe one vector of the same byte width; confirm.
222 template <
typename WorkspaceType,
typename OutputStage>
223 static void *initialise(WorkspaceType *ws,
void *buffer,
224 const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &
args)
226 const auto n_rows =
args.strategy->get_output_rows();
227 ws->output_row_ptrs =
reinterpret_cast<T **
>(buffer);
228 ws->output_ld_cols =
reinterpret_cast<size_t *
>(ws->output_row_ptrs + n_rows);
229 ws->output_ld_vls = ws->output_ld_cols + n_rows;
230 ws->output_padding_buffer =
reinterpret_cast<T *
>(ws->output_ld_vls + n_rows);
231 return ws->output_padding_buffer + get_vector_length<T>(
args.strategy->get_vl_type());
// Head of the driver class template; TWeight and TOutput default to TInput.
// The rest of the template parameter list and the class name are on lines not
// visible in this excerpt.
238 template <
typename TInput,
typename TWeight=TInput,
typename TOutput=TInput,
// Base class alias.
243 using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
// The workspace is composed from the output-row bookkeeping element and an
// activations element; the closing of this alias is not visible here, so
// further elements cannot be ruled out.
245 using WorkspaceManager = Workspace<
246 OutputRowPtrsElement<TOutput>,
247 ActivationsElement<TAccum, OutputStage>
249 using WorkspaceType =
typename WorkspaceManager::WorkspaceType;
// The strategy is held through a unique_ptr. m_bias is a plain pointer that
// starts as nullptr and is assigned when parameters are packed.
251 std::unique_ptr<StrategyType> m_strat;
252 const TAccum *m_bias;
// Constructor initialisers: forwards args to the parent, stores strat and the
// output stage os, and leaves the bias pointer null.
257 : Parent(
args), m_strat(strat), m_bias(
nullptr), m_os(os)
// Delegates to the strategy, passing the arguments stored on this object
// (m_args). The enclosing function head is not visible in this excerpt.
266 return m_strat->get_storage_size(this->m_args);
// Parameter list and body of the parameter-packing entry point; the function
// name is on a line not visible in this excerpt.
270 void *buffer,
const void *biases,
271 const void *weights,
size_t ld_weight_col,
size_t ld_weight_row
// Packing is delegated to the strategy; the output-stage argument is passed
// as an empty brace initialiser rather than m_os.
274 m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
// Keep the bias pointer supplied by the caller, viewed as const TAccum *.
// NOTE(review): only the pointer is stored, so presumably the caller must
// keep the bias data alive for as long as kernels run - confirm.
275 this->m_bias =
reinterpret_cast<const TAccum *
>(biases);
// Hand the biases and the stored output stage to the shared stash_bias helper.
276 depthwise_depthfirst::stash_bias(this->m_os, biases);
// Total working space: the per-thread size multiplied by n_threads. The
// enclosing function head is not visible in this excerpt.
281 return this->get_working_size_per_thread() * n_threads;
// Per-thread working size, obtained from WorkspaceManager::get_sizeof_workspace;
// the arguments of that call are on lines not visible in this excerpt.
286 virtual size_t get_working_size_per_thread(
void)
const
288 return WorkspaceManager::get_sizeof_workspace(
// Sets up a workspace through WorkspaceManager::initialise. Among the
// arguments (some are on lines not shown) is a WorkspaceArgs built from the
// strategy pointer, the stored arguments and the stored output stage.
293 virtual void initialise_working_space(
void *buffer)
const
295 WorkspaceManager::initialise(
297 WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
// Fills the per-row output pointer and stride arrays in the workspace and
// then invokes the planar kernel through PlanarKernelType::execute.
// The trailing parameters of this function and the statement that obtains ws
// are on lines not visible in this excerpt.
302 virtual void execute_kernel(
303 const TInput *inptr,
size_t ld_in_row,
size_t ld_in_col,
size_t ld_in_vl,
304 unsigned int pad_top,
unsigned int valid_input_rows,
305 unsigned int pad_left,
unsigned int valid_input_cols,
306 const TWeight *weights,
const TAccum *
bias,
307 TOutput *outptr,
size_t ld_out_row,
size_t ld_out_col,
size_t ld_out_vl,
308 unsigned int valid_output_rows,
unsigned int valid_output_cols,
309 unsigned int first_channel,
unsigned int valid_channels,
// One entry per output row produced by the strategy.
314 for (
auto i = 0u; i < m_strat->get_output_rows(); i++)
// Rows with i < valid_output_rows use the real output pointer and the caller
// strides; the remaining rows are redirected to the workspace padding buffer
// with both strides set to zero.
318 ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
319 ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
320 ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
// Step the output pointer on by one row stride.
321 outptr += ld_out_row;
// Kernel dispatch. The strategy pointer is reinterpret_cast to the concrete
// PlanarStrategy instantiation in order to reach get_kernel().
// NOTE(review): this relies on m_strat really pointing at that concrete
// type - confirm where the strategy is created.
// Some arguments of this call are on lines not visible in this excerpt.
325 PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
326 reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *
>(m_strat.get())->get_kernel(),
327 inptr, ld_in_row, ld_in_col, ld_in_vl,
328 pad_top, valid_input_rows, pad_left, valid_input_cols,
330 ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
331 valid_output_cols, first_channel, valid_channels,
// Per-thread driver: prepares this thread's workspace, then walks the output
// rows assigned to the thread and calls execute_kernel for each group of rows.
// Several parameters (for example the input, output, parameters and
// working_space pointers and the input row and column strides used below) are
// declared on lines not visible in this excerpt, as is the enclosing batch
// loop.
336 void execute_internal(
337 const DepthwiseArgs &
args,
341 size_t ld_input_batch,
344 size_t ld_output_col,
345 size_t ld_output_row,
346 size_t ld_output_batch,
348 unsigned int thread_id,
349 unsigned int n_threads
// Each thread owns a slice of working_space located thread_id times the
// per-thread size (in bytes) from the start. The slice is initialised here
// and then viewed as WorkspaceType.
353 void *thread_working_space =
354 static_cast<uint8_t *
>(working_space) + thread_id * this->get_working_size_per_thread();
355 this->initialise_working_space(thread_working_space);
356 auto ws =
reinterpret_cast<WorkspaceType *
>(thread_working_space);
// Number of output channels is input channels times the channel multiplier.
358 const auto n_output_channels =
args.input_channels *
args.channel_multiplier;
// Vector length in TAccum elements; it is passed below as both the input and
// the output vl stride.
359 const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
// Typed views of the untyped caller pointers.
362 auto input_batch =
reinterpret_cast<const TInput *
>(
input);
363 auto output_batch =
reinterpret_cast<TOutput *
>(output);
364 auto weights =
reinterpret_cast<const TWeight *
>(
parameters);
// Output rows are interleaved across threads in groups of
// get_output_rows(): this thread starts at thread_id groups in and then
// skips n_threads groups each iteration.
373 for (
auto start_output_i = thread_id * m_strat->get_output_rows();
374 start_output_i <
args.output_rows;
375 start_output_i += n_threads * m_strat->get_output_rows())
// First input row needed for this output row; a negative value means that
// many rows fall into the top padding.
// NOTE(review): start_output_i is unsigned, so the subtraction presumably
// happens in unsigned arithmetic and only becomes negative through the
// conversion to int - confirm the field types of args.
379 const int start_input_i = start_output_i *
args.stride_rows -
args.padding.top;
380 const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
381 const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
// Input rows remaining from input_i, or zero when input_i is past the end.
382 const unsigned int valid_input_rows = input_i >
args.input_rows ? 0 :
args.input_rows - input_i;
// Output rows remaining from start_output_i. This can exceed the number of
// rows the strategy produces; execute_kernel compares each row index against
// it.
383 const unsigned int valid_output_rows =
args.output_rows - start_output_i;
// Row pointers within the current batch.
385 auto inptr_row = input_batch + input_i*ld_input_row;
386 auto outptr_row = output_batch + start_output_i * ld_output_row;
// The full width and all channels are handed to the kernel in one call: left
// padding and input_cols come straight from args, the channel range starts at
// 0 and spans n_output_channels. Trailing arguments are on lines not visible
// in this excerpt.
389 this->execute_kernel(
390 inptr_row, ld_input_row, ld_input_col, vl,
391 input_pad_top, valid_input_rows,
args.padding.left,
args.input_cols,
392 weights, this->m_bias,
393 outptr_row, ld_output_row, ld_output_col, vl,
394 valid_output_rows,
args.output_cols,
395 0 , n_output_channels,
// Advance both batch pointers by their batch strides.
401 input_batch += ld_input_batch;
402 output_batch += ld_output_batch;