338 strategy strat(this->m_args.cpu_info);
339 #ifdef CYCLE_PROFILING 340 arm_gemm::profiler prof;
344 TAccum activation_min = std::numeric_limits<TAccum>::has_infinity ? -std::numeric_limits<TAccum>::infinity() :
std::numeric_limits<TAccum>::min();
345 TAccum activation_max = std::numeric_limits<TAccum>::has_infinity ? std::numeric_limits<TAccum>::infinity() :
std::numeric_limits<TAccum>::max();
347 switch (this->m_args.activation.type)
350 activation_max =
static_cast<TAccum
>(this->m_args.activation.param1);
353 activation_min =
static_cast<TAccum
>(0);
361 const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
362 const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
365 const unsigned int vl = arm_gemm::utils::get_vector_length<TOutput>(strategy::vl_type);
366 const unsigned int param_stride =
368 (
sizeof(TAccum) +
sizeof(TWeight) * strategy::kernel_rows * strategy::kernel_cols);
371 const TInput *
const inptr =
static_cast<const TInput *
>(_input);
372 TOutput *
const outptr =
static_cast<TOutput *
>(_output);
378 TInput rearranged_input[strategy::input_rows][strategy::input_col_quads*4];
379 const TInput *inptrs[strategy::input_rows];
382 TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
383 TOutput **
const outptr_array = _outptr_array;
386 uint8_t *
const working_space =
static_cast<uint8_t *
>(_working_space) +
get_working_size(thread_id, input_channels);
387 TOutput *
const output_buffer =
reinterpret_cast<TOutput *
>(working_space);
391 for (
unsigned int batch = 0; batch <
batches; batch++)
394 const auto inptr_batch = inptr + batch * ld_input_batch;
395 const auto outptr_batch = outptr + batch * ld_output_batch;
397 for (
int start_out_i = start_out_height;
398 start_out_i < end_out_height;
399 start_out_i +=
static_cast<int>(strategy::output_rows))
401 const int end_out_i = start_out_i + strategy::output_rows;
402 const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
403 const int end_in_i = start_in_i + strategy::input_rows;
406 const auto pad_top =
static_cast<unsigned int>(-std::min(start_in_i, 0));
407 const auto pad_bottom =
static_cast<unsigned int>(-std::min(static_cast<int>(
input_height) - end_in_i, 0));
408 const unsigned int valid_output_rows = std::min(
409 end_out_i - start_out_i,
410 static_cast<int>(output_height) - start_out_i
413 for (
int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
415 const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
416 const int pad_left = -std::min(0, start_in_j);
418 const int end_out_j = start_out_j + strategy::output_cols;
419 const int end_in_j = start_in_j + strategy::input_cols;
421 const auto pad_right =
static_cast<unsigned int>(-std::min(static_cast<int>(
input_width) - end_in_j, 0));
422 const unsigned int valid_output_cols = std::min(
423 end_out_j - start_out_j,
424 static_cast<int>(output_width) - start_out_j
428 TOutput **outptr_pos = outptr_array;
429 for (
auto i = 0u; i < valid_output_rows; i++)
432 TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
433 for (; j < valid_output_cols; j++)
435 *(outptr_pos++) = colptr;
436 colptr += ld_output_col;
438 for (; j < strategy::output_cols; j++)
440 *(outptr_pos++) = output_buffer;
443 for (
auto i = valid_output_rows; i < strategy::output_rows; i++)
445 for (
auto j = 0u; j < strategy::output_cols; j++)
447 *(outptr_pos++) = output_buffer;
451 start_out_j += strategy::output_cols;
453 const uint8_t *params =
static_cast<const uint8_t *
>(
parameters);
456 for (
unsigned int in_c = 0; in_c < input_channels; in_c++)
460 for (
unsigned int i = 0; i < strategy::input_rows; i++)
462 for (
unsigned int j = 0; j < 4 * strategy::input_col_quads; j++)
464 rearranged_input[i][j] =
static_cast<TInput
>(0);
466 inptrs[i] = rearranged_input[i];
469 auto inptr_row = inptr_batch + in_c +
470 (start_in_i + pad_top) * ld_input_row +
471 (start_in_j + pad_left) * ld_input_col;
472 if (ld_input_col == 1 && !pad_left &&
473 start_in_j + 4 * strategy::input_col_quads <
input_width)
478 for (
unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
480 inptrs[i] = inptr_row;
481 inptr_row += ld_input_row;
489 for (
unsigned int i = pad_top; i < strategy::input_rows - pad_bottom; i++)
491 auto inptr_col = inptr_row;
492 for (
unsigned int j = pad_left; j < strategy::input_cols - pad_right; j++)
494 rearranged_input[i][j] = *inptr_col;
495 inptr_col += ld_input_col;
497 inptr_row += ld_input_row;
502 #ifdef CYCLE_PROFILING 503 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (
unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.channel_multiplier * strategy::kernel_rows * strategy::kernel_cols));
506 inptrs, outptr_array, params,
507 this->m_args.channel_multiplier,
508 activation_min, activation_max
513 TOutput **outptr_pos = outptr_array;
514 for (
auto i = 0u; i < strategy::output_rows * strategy::output_cols; i++)
516 outptr_pos[i] += this->m_args.channel_multiplier;
520 params += param_stride;
T roundup(const T a, const T b)
T iceildiv(const T a, const T b)
size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
std::unique_ptr< ParametersLibrary > parameters
const size_t input_height