// Type-erased (std::function) signature of a kernel entry point; the callable
// returns void and receives, in order:
//   1. a const pointer to an array of const pointers to const TInput,
//   2. a const pointer to an array of const pointers to (mutable) TOutput,
//   3. two opaque `const void *` arguments,
//   4. two `const unsigned int` arguments,
//   5. two `const TAccum` arguments.
// NOTE(review): call fragments further down this file pass
// (n_kernel_points, n_output_channels) followed by
// (activation_min, activation_max) in trailing positions; presumably these
// map onto the unsigned-int and TAccum pairs here - confirm against the
// kernel implementations before relying on it.
32 template <
typename TInput,
typename TOutput,
typename TAccum>
35 using KernelType = std::function<void(
const TInput *
const *
const, TOutput *
const *
const,
const void *,
const void *,
const unsigned int,
const unsigned int,
const TAccum,
const TAccum)>;
38 template <
typename TInput,
typename TOutput>
44 template <
typename TInput,
typename TWeight,
typename TOutput,
typename TAccum>
47 unsigned int m_n_output_points;
49 unsigned int m_accumulator_depth_vl;
53 : m_n_output_points(n_output_points), m_vl_type(vl_type), m_accumulator_depth_vl(accumulator_depth_vl)
67 template <
typename TInput,
68 typename TWeight=TInput,
69 typename TOutput=TInput,
76 std::unique_ptr<KernelStrategyType> m_strategy;
81 const DepthwiseArgs &
args
84 n_output_rows, n_output_cols,
104 [
this] (
unsigned int idx,
unsigned int &x,
unsigned int &y) ->
bool
111 const DepthwiseArgs &
args,
void *buffer,
112 const void *biases,
const OutputStage &,
113 const void *weights,
size_t ld_weight_col,
size_t ld_weight_row
120 [
this] (
unsigned int idx,
unsigned int &x,
unsigned int &y) ->
bool
124 packing_args,
args, buffer, biases, weights, ld_weight_col, ld_weight_row);
136 template <
typename StratType,
typename WorkspaceType,
typename TAccum>
138 const StratType *strat,
const WorkspaceType *ws,
const Nothing &,
139 const TAccum *
bias,
const void *params,
140 const unsigned int n_kernel_points,
const unsigned int n_output_channels
147 n_kernel_points, n_output_channels,
148 ws->activation_min, ws->activation_max
156 template <
typename StratType,
typename WorkspaceType>
159 const int32_t *,
const void *params,
160 const unsigned int n_kernel_points,
const unsigned int n_output_channels
167 n_kernel_points, n_output_channels
176 template <
typename T>
185 template <
class OutputStage>
// Number of kernel taps: kernel_rows * kernel_cols taken from the depthwise
// arguments carried by `args`.
188 const auto kernel_points =
args.depthwise_args.kernel_rows *
args.depthwise_args.kernel_cols;
// Size in bytes of one pointer-sized slot for every combination of
// (strategy output row, strategy output column, kernel point).
// NOTE(review): the slot size is spelled sizeof(T **), whereas a nearby
// initialise() casts its buffer to `const T**` (i.e. elements of type
// `const T*`). Both are object-pointer sizes, so presumably this is
// equivalent on supported targets - confirm the two belong to the same
// workspace element.
189 return sizeof(T **) *
args.strategy->get_output_rows() *
args.strategy->get_output_cols() * kernel_points;
192 template <
class WorkspaceType,
class OutputStage>
193 static void *
initialise(WorkspaceType *ws,
void *buffer,
const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &
args)
195 ws->inptr_array =
reinterpret_cast<const T**
>(buffer);
200 template <
typename TInput,
typename TWeight=TInput,
typename TOutput=TInput,
207 using WorkspaceManager = Workspace<
208 OutputArrayElement<TOutput>,
210 InputBufferElement<TInput>,
211 IntermediateBufferElement<TInput>,
212 ActivationsElement<TAccum, OutputStage>
214 using WorkingSpace =
typename WorkspaceManager::WorkspaceType;
215 const TAccum *m_bias =
nullptr;
219 : Parent(strat,
args, os)
227 void *buffer,
const void *biases,
228 const void *weights,
size_t ld_weight_col,
size_t ld_weight_row
232 m_bias =
reinterpret_cast<const TAccum *
>(biases);
233 depthwise_depthfirst::stash_bias(this->get_output_stage(), m_bias);
238 DepthwiseArgs
args(this->m_args);
239 return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(),
args, this->get_output_stage()));
244 DepthwiseArgs
args(this->m_args);
245 return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(),
args, this->get_output_stage()));
249 void fill_inptr_array(
const DepthwiseArgs &
args,
252 const unsigned int input_i,
const unsigned int input_j,
253 const unsigned int input_pad_top,
const unsigned int input_pad_left)
const override
255 fill_pointer_array_generic_kernel<const TInput>(
257 this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
263 input_pad_top,
args.input_rows - input_i,
264 input_pad_left,
args.input_cols - input_j
268 void compute_tile_padded(
269 const DepthwiseArgs &
args,
270 unsigned int output_i,
unsigned int output_j,
271 unsigned int channel_start,
unsigned int channel_end,
272 const TensorSpec<const TInput *> &
input,
273 const TensorSpec<TOutput *> &output,
275 void *working_space_raw
279 WorkingSpace *ws =
reinterpret_cast<WorkingSpace *
>(working_space_raw);
// Signed input row at which this output tile starts: the output row index
// scaled by the row stride, minus the top padding. A negative value means the
// tile begins inside the top padding region.
281 const int ii =
static_cast<int>(output_i *
args.stride_rows) -
args.padding.top;
// Number of rows by which the tile starts above the input (-ii when ii is
// negative), otherwise 0.
282 const auto input_pad_top =
static_cast<unsigned int>(ii < 0 ? -ii : 0);
// First input row actually read: ii clamped so it is never negative.
283 const auto input_i =
static_cast<unsigned int>(ii < 0 ? 0 : ii);
// Same computation along the column axis, using the column stride and the
// left padding.
285 const int ij =
static_cast<int>(output_j *
args.stride_cols) -
args.padding.left;
// Number of columns by which the tile starts left of the input (-ij when ij
// is negative), otherwise 0.
286 const auto input_pad_left =
static_cast<unsigned int>(ij < 0 ? -ij : 0);
// First input column actually read: ij clamped so it is never negative.
287 const auto input_j =
static_cast<unsigned int>(ij < 0 ? 0 : ij);
289 Tile<TInput> multiplied_input;
290 this->initialise_inptr_array(
args, channel_start, channel_end,
input,
291 ws->inptr_array, ws->input_buffer, ws->intermediate_buffer,
292 input_i, input_j, input_pad_top, input_pad_left, multiplied_input);
295 fill_pointer_array<TOutput>(
296 ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
297 output.base + output_i*output.ld_row + output_j*output.ld_col + channel_start,
298 output.ld_row, output.ld_col,
300 0,
args.output_rows - output_i,
301 0,
args.output_cols - output_j
305 DepthwiseDepthfirstGenericKernelCall<OutputStage>::execute(
306 reinterpret_cast<const StratType *
>(this->m_strat.get()), ws,
308 args.kernel_rows *
args.kernel_cols,
309 channel_end - channel_start