// Pulls in the cycle profiler header when CYCLE_PROFILING is defined; the function template
// depthwise_multiplier_execute<strategy, F> starts at the end of the same line.
//
// NOTE(review): this listing has listing line numbers fused into the text and has lost its
// brace-only lines and a few statements (the definitions of `n_rows_per_thread` and of the
// column index `j`, and the leading parameters of the signature, are not visible here).
// The notes below describe only what the visible lines establish.
//
// Purpose (from the visible body): walks every batch, every tile of output rows assigned to
// this thread and every tile of output columns; for each tile it builds an array of output
// pointers and, per input channel, an array of input row pointers, then calls
// `execute_tile(inptrs, outptr_array, params)`.
//
// Visible parameters and how the body uses them:
//   pad_value        - value written into the staging buffer before valid input is copied in.
//   args             - read for `padding.left` and `channel_multiplier`.
//   input_channels   - trip count of the innermost (per-channel) loop.
//   padding          - read for `padding.top`.
//   _input / _output - untyped tensors, cast to TInput / TOutput pointers.
//   ld_*_col/row/batch - strides added to typed pointers, i.e. counted in elements.
//   param_stride     - added to a uint8_t pointer after each channel, i.e. counted in bytes.
//   _working_space   - reinterpreted as a TOutput buffer that receives out-of-range outputs.
//   thread_id, n_threads - select the slice of output rows handled by this call.
29 #ifdef CYCLE_PROFILING 30 #include "profiler.hpp" 40 template <
typename strategy,
typename F>
43 typename strategy::input_type pad_value,
44 const DepthwiseArgs &
args,
48 const unsigned int input_channels,
49 const PaddingValues &padding,
50 const void *
const _input,
51 const size_t ld_input_col,
52 const size_t ld_input_row,
53 const size_t ld_input_batch,
55 const size_t param_stride,
56 const unsigned int output_height,
57 const unsigned int output_width,
59 const size_t ld_output_col,
60 const size_t ld_output_row,
61 const size_t ld_output_batch,
62 void *
const _working_space,
63 const unsigned int thread_id,
64 const unsigned int n_threads
// Element types come from the strategy.
67 using TInput =
typename strategy::input_type;
68 using TOutput =
typename strategy::return_type;
// Output-row range for this thread, clamped to output_height.
// NOTE(review): `n_rows_per_thread` is defined on a line not visible here — presumably
// output_height divided (rounding up) by n_threads; confirm.
72 const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
73 const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
// Typed views of the untyped input and output tensors.
76 const TInput *
const inptr =
static_cast<const TInput *
>(_input);
77 TOutput *
const outptr =
static_cast<TOutput *
>(_output);
// Staging buffer for one input tile: input_rows rows, each input_col_quads 16-byte quads wide
// (16 / sizeof(TInput) elements per quad), plus one row pointer per input row.
83 TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*(16 /
sizeof(TInput))];
84 const TInput *inptrs[strategy::input_rows];
// One output pointer per element of the output tile.
87 TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
88 TOutput **
const outptr_array = _outptr_array;
// The working space is used directly (no per-thread offset is applied in the visible code)
// as the destination for tile outputs that fall outside the valid output area.
91 uint8_t *
const working_space =
static_cast<uint8_t *
>(_working_space);
92 TOutput *
const output_buffer =
reinterpret_cast<TOutput *
>(working_space);
// Outer loop over batches.
96 for (
unsigned int batch = 0; batch <
batches; batch++)
// Base pointers of the current batch.
99 const auto inptr_batch = inptr + batch * ld_input_batch;
100 const auto outptr_batch = outptr + batch * ld_output_batch;
// Step through this thread's output rows one tile (strategy::output_rows) at a time.
102 for (
int start_out_i = start_out_height;
103 start_out_i < end_out_height;
104 start_out_i +=
static_cast<int>(strategy::output_rows))
// Input rows touched by this output tile; start_in_i goes negative inside the top padding.
106 const int end_out_i = start_out_i + strategy::output_rows;
107 const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
108 const int end_in_i = start_in_i + strategy::input_rows;
// Number of tile rows lying above / below the real input, and number of output rows
// of the tile that lie inside the output tensor.
111 const auto pad_top =
static_cast<unsigned int>(-std::min(start_in_i, 0));
112 const auto pad_bottom =
static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
113 const unsigned int valid_output_rows = std::min(
114 end_out_i - start_out_i,
115 static_cast<int>(output_height) - start_out_i
// Loop over column tiles; the header has no increment clause, start_out_j is advanced
// inside the body after the output pointers have been filled in.
118 for (
int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
// NOTE(review): the left padding is read from `args.padding` whereas the top padding above
// is read from the `padding` parameter — confirm the two always agree.
120 const int start_in_j = start_out_j * strategy::stride_cols - args.padding.left;
121 const int pad_left = -std::min(0, start_in_j);
123 const int end_out_j = start_out_j + strategy::output_cols;
124 const int end_in_j = start_in_j + strategy::input_cols;
126 const auto pad_right =
static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
127 const unsigned int valid_output_cols = std::min(
128 end_out_j - start_out_j,
129 static_cast<int>(output_width) - start_out_j
// Fill the output pointer array: valid positions point into the output tensor, every other
// position points at output_buffer.
133 TOutput **outptr_pos = outptr_array;
134 for (
auto i = 0u; i < valid_output_rows; i++)
// NOTE(review): the declaration of `j` used by the next two loops is not visible here.
137 TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
138 for (; j < valid_output_cols; j++)
140 *(outptr_pos++) = colptr;
141 colptr += ld_output_col;
// Remaining columns of a valid row fall outside the output width.
143 for (; j < strategy::output_cols; j++)
145 *(outptr_pos++) = output_buffer;
// Rows of the tile below the valid output rows are redirected entirely to output_buffer.
148 for (
auto i = valid_output_rows; i < strategy::output_rows; i++)
150 for (
auto j = 0u; j < strategy::output_cols; j++)
152 *(outptr_pos++) = output_buffer;
// Advance to the next column tile (start_in_j above was computed before this increment).
156 start_out_j += strategy::output_cols;
// Byte pointer into the parameter block; restarted for every tile.
158 const uint8_t *params =
static_cast<const uint8_t *
>(
parameters);
// Innermost loop: one tile execution per input channel.
161 for (
unsigned int in_c = 0; in_c < input_channels; in_c++)
// Fill the whole staging buffer with pad_value and point every row pointer at it.
165 for (
unsigned int i = 0; i < strategy::input_rows; i++)
167 for (
unsigned int j = 0;
168 j < (16 /
sizeof(TInput)) * strategy::input_col_quads; j++)
170 rearranged_input[i][j] = pad_value;
172 inptrs[i] = rearranged_input[i];
// First valid input element of the tile for this channel.
175 auto inptr_row = inptr_batch + in_c +
176 (start_in_i + pad_top) * ld_input_row +
177 (start_in_j + pad_left) * ld_input_col;
// When columns are contiguous, there is no left padding and a full staging-row width starting
// at start_in_j lies inside the input width, the non-padded row pointers are aimed straight at
// the input tensor.
178 if (ld_input_col == 1 && !pad_left &&
179 start_in_j + (16 /
sizeof(TInput)) * strategy::input_col_quads < input_width)
184 for (
unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
186 inptrs[i] = inptr_row;
187 inptr_row += ld_input_row;
// NOTE(review): presumably the alternative branch of the condition above (the `else` line is
// not visible): valid elements are copied one by one into the staging buffer, leaving the
// padded border at pad_value.
195 for (
unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
197 auto inptr_col = inptr_row;
198 for (
unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
200 rearranged_input[i][j] = *inptr_col;
201 inptr_col += ld_input_col;
203 inptr_row += ld_input_row;
// Run the caller-supplied tile routine on the prepared pointers.
207 execute_tile(inptrs, outptr_array, params);
// Move every output pointer on by channel_multiplier elements, ready for the next input channel.
210 TOutput **outptr_pos = outptr_array;
211 for (
auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
213 outptr_pos[i] += args.channel_multiplier;
// Step to the parameters of the next input channel.
217 params += param_stride;
// Class template parameterised on `strategy`; it derives publicly from DepthwiseCommon
// instantiated with the strategy's input, weight and return types.
// NOTE(review): the line that names the class is not visible in this listing — presumably
// DepthwiseDepthfirstWithMultiplier; confirm.
225 template <
class strategy>
227 public DepthwiseCommon<typename strategy::input_type,
228 typename strategy::weight_type,
229 typename strategy::return_type>
// Shorthand aliases for the element types supplied by the strategy; TAccum is the
// strategy's bias type.
231 using TInput =
typename strategy::input_type;
232 using TWeight =
typename strategy::weight_type;
233 using TOutput =
typename strategy::return_type;
234 using TAccum =
typename strategy::bias_type;
// Returns a size in bytes: sizeof(TOutput) multiplied by `rounded_channels`.
// `vl` is the vector length for TOutput under the strategy's vector-length type.
// @param n_channels  channel count supplied by the caller.
236 size_t sizeof_output_buffer(
unsigned int n_channels)
const 238 const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
// NOTE(review): the definition of `rounded_channels` is not visible in this listing —
// presumably n_channels rounded up to a multiple of vl; confirm.
240 return sizeof(TOutput) * rounded_channels;
// NOTE(review): the signature for this body is not visible in this listing — presumably
// `size_t get_storage_size(void) const override`; confirm.
// Returns a size in bytes: for each input channel the channel multiplier is rounded up to a
// multiple of the TInput vector length, and (1 + kernel_rows * kernel_cols) elements of
// sizeof(TWeight) are counted per rounded channel.
// NOTE(review): the leading `1 +` presumably accounts for the bias, which is therefore sized
// as a TWeight here — confirm against how the parameter stride is computed elsewhere.
254 const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
255 const auto rounded_channels = this->m_args.input_channels *
arm_gemm::roundup(this->m_args.channel_multiplier, vl);
256 return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels *
sizeof(TWeight);
// Copies biases and weights into `_buffer`, processing each input channel's multiplier
// outputs in groups of at most `vl` channels: first the biases of the group (or zeros), then
// the group's weights for every kernel row and column.
// @param _buffer        destination buffer.
// @param _biases        source biases; may be nullptr, in which case zeros are written.
// @param _weights       source weights.
// @param ld_weight_col  stride between kernel columns; 0 selects
//                       channel_multiplier * input_channels.
// @param ld_weight_row  stride between kernel rows; 0 selects kernel_cols * ld_weight_col.
// NOTE(review): all three pointers are cast to float regardless of TWeight / TAccum —
// confirm this is intended for every strategy instantiated with this class.
// NOTE(review): the statements that advance `buffer` between the writes below are not visible
// in this listing.
259 void pack_parameters(
void *_buffer,
const void *_biases,
const void *_weights,
size_t ld_weight_col,
size_t ld_weight_row)
override 264 float *buffer =
static_cast<float *
>(_buffer);
265 const float *biases =
static_cast<const float *
>(_biases);
266 const float *
const weights =
static_cast<const float *
>(_weights);
// Vector length for TInput; resolve the default (zero) weight strides.
268 const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
269 ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col;
270 ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
// For every input channel, walk its multiplier outputs vl channels at a time.
272 for (
unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++)
274 for (
unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl)
// First output channel of the group and the number of channels actually present in it.
276 const unsigned int out_c = in_c * this->m_args.channel_multiplier + n;
277 const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n);
// Biases of the group, or zero when no bias tensor was supplied.
280 for (
unsigned int i = 0; i < todo; i++)
282 buffer[i] = (biases ==
nullptr) ? 0 : biases[out_c + i];
// Weights of the group, kernel row by kernel row and column by column.
287 auto weights_row = weights + out_c;
288 for (
unsigned int i = 0; i < this->m_args.kernel_rows; i++)
290 auto weights_col = weights_row;
292 for (
unsigned int j = 0; j < this->m_args.kernel_cols; j++)
294 for (
unsigned int m = 0; m < todo; m++)
296 buffer[m] = weights_col[m];
300 weights_col += ld_weight_col;
303 weights_row += ld_weight_row;
// Returns the working-space size in bytes: n_threads output buffers, each sized by
// sizeof_output_buffer for n_channels * channel_multiplier output channels.
// @param n_threads   number of per-thread buffers to account for.
// @param n_channels  number of input channels.
309 size_t get_working_size(
const unsigned int n_threads,
const unsigned int n_channels)
const override 311 const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
312 return n_threads * sizeof_output_buffer(n_output_channels);
// Re-exposes the base class's `execute` overloads in this class's scope.
315 using DepthwiseCommon<typename strategy::input_type, typename strategy::weight_type, typename strategy::return_type>::execute;
// NOTE(review): the opening of this method's signature is not visible in this listing —
// presumably the `execute(...) const override` overload taking batches, input dimensions,
// tensors, strides, working space and thread indices; confirm.
//
// Purpose (from the visible body): for this thread's slice of output rows, tile the output,
// build output and input pointer arrays per tile, stage padded input per input channel and
// invoke the strategy's kernel with the channel multiplier and activation bounds.
320 const unsigned int input_channels,
321 const PaddingValues &padding,
322 const void *
const _input,
323 const size_t ld_input_col,
324 const size_t ld_input_row,
325 const size_t ld_input_batch,
327 const unsigned int output_height,
328 const unsigned int output_width,
330 const size_t ld_output_col,
331 const size_t ld_output_row,
332 const size_t ld_output_batch,
333 void *
const _working_space,
334 const unsigned int thread_id,
335 const unsigned int n_threads
// Instantiate the strategy for the CPU described in the arguments.
338 strategy strat(this->m_args.cpu_info);
// Profiler instance, declared under CYCLE_PROFILING.
339 #ifdef CYCLE_PROFILING 340 arm_gemm::profiler prof;
// Default activation bounds: +/- infinity when TAccum has one, otherwise the numeric limits.
344 TAccum activation_min = std::numeric_limits<TAccum>::has_infinity ? -std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::min();
345 TAccum activation_max = std::numeric_limits<TAccum>::has_infinity ? std::numeric_limits<TAccum>::infinity() : std::numeric_limits<TAccum>::max();
// Narrow the bounds according to the configured activation.
// NOTE(review): the case labels are not visible in this listing — presumably a bounded
// activation sets the maximum from param1 and then shares the zero minimum with a
// plain rectifier; confirm.
347 switch (this->m_args.activation.type)
350 activation_max =
static_cast<TAccum
>(this->m_args.activation.param1);
353 activation_min =
static_cast<TAccum
>(0);
// Output-row range for this thread, clamped to output_height.
// NOTE(review): `n_rows_per_thread` is defined on a line not visible here.
361 const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
362 const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
// Byte stride between the parameters of consecutive input channels: one TAccum bias plus
// kernel_rows * kernel_cols TWeight values, times a leading factor.
// NOTE(review): the leading factor is on a line not visible here — presumably the channel
// multiplier rounded up to a multiple of vl; confirm.
365 const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
366 const unsigned int param_stride =
368 (
sizeof(TAccum) +
sizeof(TWeight) * strategy::kernel_rows * strategy::kernel_cols);
// Typed views of the untyped input and output tensors.
371 const TInput *
const inptr =
static_cast<const TInput *
>(_input);
372 TOutput *
const outptr =
static_cast<TOutput *
>(_output);
// Staging buffer for one input tile and one row pointer per input row.
// NOTE(review): the row width is hard-coded as 4 elements per quad here, whereas the free
// function depthwise_multiplier_execute in this file uses 16 / sizeof(TInput); the two agree
// only for 4-byte TInput — confirm.
378 TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*4];
379 const TInput *inptrs[strategy::input_rows];
// One output pointer per element of the output tile.
382 TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
383 TOutput **
const outptr_array = _outptr_array;
// This thread's region of the working space: skip the bytes that get_working_size reports
// for `thread_id` threads. It receives tile outputs that fall outside the valid output area.
386 uint8_t *
const working_space =
static_cast<uint8_t *
>(_working_space) + get_working_size(thread_id, input_channels);
387 TOutput *
const output_buffer =
reinterpret_cast<TOutput *
>(working_space);
// Outer loop over batches.
391 for (
unsigned int batch = 0; batch <
batches; batch++)
// Base pointers of the current batch.
394 const auto inptr_batch = inptr + batch * ld_input_batch;
395 const auto outptr_batch = outptr + batch * ld_output_batch;
// Step through this thread's output rows one tile (strategy::output_rows) at a time.
397 for (
int start_out_i = start_out_height;
398 start_out_i < end_out_height;
399 start_out_i +=
static_cast<int>(strategy::output_rows))
// Input rows touched by this output tile; start_in_i goes negative inside the top padding.
401 const int end_out_i = start_out_i + strategy::output_rows;
402 const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
403 const int end_in_i = start_in_i + strategy::input_rows;
// Rows of the tile above / below the real input, and valid output rows of the tile.
406 const auto pad_top =
static_cast<unsigned int>(-std::min(start_in_i, 0));
407 const auto pad_bottom =
static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
408 const unsigned int valid_output_rows = std::min(
409 end_out_i - start_out_i,
410 static_cast<int>(output_height) - start_out_i
// Loop over column tiles; start_out_j is advanced inside the body, not in the header.
413 for (
int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
// NOTE(review): the left padding is read from `this->m_args.padding` whereas the top padding
// above is read from the `padding` parameter — confirm the two always agree.
415 const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
416 const int pad_left = -std::min(0, start_in_j);
418 const int end_out_j = start_out_j + strategy::output_cols;
419 const int end_in_j = start_in_j + strategy::input_cols;
421 const auto pad_right =
static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
422 const unsigned int valid_output_cols = std::min(
423 end_out_j - start_out_j,
424 static_cast<int>(output_width) - start_out_j
// Fill the output pointer array: valid positions point into the output tensor, every other
// position points at this thread's output_buffer.
428 TOutput **outptr_pos = outptr_array;
429 for (
auto i = 0u; i < valid_output_rows; i++)
// NOTE(review): the declaration of `j` used by the next two loops is not visible here.
432 TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
433 for (; j < valid_output_cols; j++)
435 *(outptr_pos++) = colptr;
436 colptr += ld_output_col;
// Remaining columns of a valid row fall outside the output width.
438 for (; j < strategy::output_cols; j++)
440 *(outptr_pos++) = output_buffer;
// Rows of the tile below the valid output rows are redirected entirely to output_buffer.
443 for (
auto i = valid_output_rows; i < strategy::output_rows; i++)
445 for (
auto j = 0u; j < strategy::output_cols; j++)
447 *(outptr_pos++) = output_buffer;
// Advance to the next column tile (start_in_j above was computed before this increment).
451 start_out_j += strategy::output_cols;
// Byte pointer into the packed parameters; restarted for every tile.
453 const uint8_t *params =
static_cast<const uint8_t *
>(
parameters);
// Innermost loop: one kernel invocation per input channel.
456 for (
unsigned int in_c = 0; in_c < input_channels; in_c++)
// Zero the staging buffer and point every row pointer at it.
460 for (
unsigned int i = 0; i < strategy::input_rows; i++)
462 for (
unsigned int j = 0; j < 4 * strategy::input_col_quads; j++)
464 rearranged_input[i][j] =
static_cast<TInput
>(0);
466 inptrs[i] = rearranged_input[i];
// First valid input element of the tile for this channel.
469 auto inptr_row = inptr_batch + in_c +
470 (start_in_i + pad_top) * ld_input_row +
471 (start_in_j + pad_left) * ld_input_col;
// When columns are contiguous, there is no left padding and a full staging-row width starting
// at start_in_j lies inside the input width, the non-padded row pointers are aimed straight at
// the input tensor.
472 if (ld_input_col == 1 && !pad_left &&
473 start_in_j + 4 * strategy::input_col_quads < input_width)
478 for (
unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
480 inptrs[i] = inptr_row;
481 inptr_row += ld_input_row;
// NOTE(review): presumably the alternative branch of the condition above (the `else` line is
// not visible): valid elements are copied one by one into the staging buffer, leaving the
// padded border at zero.
489 for (
unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
491 auto inptr_col = inptr_row;
492 for (
unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
494 rearranged_input[i][j] = *inptr_col;
495 inptr_col += ld_input_col;
497 inptr_row += ld_input_row;
// Optional profiling scope; the count passed is output tile size * channel multiplier *
// kernel size.
502 #ifdef CYCLE_PROFILING 503 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (
unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.channel_multiplier * strategy::kernel_rows * strategy::kernel_cols));
// Arguments of the kernel invocation.
// NOTE(review): the callee is on a line not visible here — presumably the kernel of `strat`;
// confirm.
506 inptrs, outptr_array, params,
507 this->m_args.channel_multiplier,
508 activation_min, activation_max
// Move every output pointer on by channel_multiplier elements, ready for the next input channel.
513 TOutput **outptr_pos = outptr_array;
514 for (
auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
516 outptr_pos[i] += this->m_args.channel_multiplier;
// Step to the parameters of the next input channel.
520 params += param_stride;
T roundup(const T a, const T b)
DepthwiseDepthfirstWithMultiplier(const DepthwiseArgs &args)
T iceildiv(const T a, const T b)
size_t get_storage_size(void) const override
size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
std::unique_ptr< ParametersLibrary > parameters
void depthwise_multiplier_execute(const F execute_tile, typename strategy::input_type pad_value, const DepthwiseArgs &args, const unsigned int batches, const unsigned int input_height, const unsigned int input_width, const unsigned int input_channels, const PaddingValues &padding, const void *const _input, const size_t ld_input_col, const size_t ld_input_row, const size_t ld_input_batch, const void *const parameters, const size_t param_stride, const unsigned int output_height, const unsigned int output_width, void *const _output, const size_t ld_output_col, const size_t ld_output_row, const size_t ld_output_batch, void *const _working_space, const unsigned int thread_id, const unsigned int n_threads)
const size_t input_height
template UniqueDepthwiseCommon< float > depthwise(const DepthwiseArgs &, const Nothing &)
void execute(const unsigned int batches, const unsigned int input_height, const unsigned int input_width, const unsigned int input_channels, const PaddingValues &padding, const void *const _input, const size_t ld_input_col, const size_t ld_input_row, const size_t ld_input_batch, const void *const parameters, const unsigned int output_height, const unsigned int output_width, void *const _output, const size_t ld_output_col, const size_t ld_output_row, const size_t ld_output_batch, void *const _working_space, const unsigned int thread_id, const unsigned int n_threads) const override