29 #include "arm_gemm.hpp" 37 #ifdef CYCLE_PROFILING 38 #include "profiler.hpp" 43 #define ALLOC_ROUND 64 44 #define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) 59 template<
bool MergeStep,
typename OutputStage>
60 class kernel_and_merge {
62 template<
typename strategy,
typename To,
typename Tr,
typename Tri,
typename Tab>
64 #ifdef CYCLE_PROFILING
67 strategy &strat,
const To *a_ptr,
const To *b_panel, Tri *c_panel,
68 Tr *c_ptr,
int ldc,
int kern_k,
unsigned int m_0,
69 unsigned int m_max,
unsigned int n_0,
unsigned int n_max,
const Tr *biasptr,
76 template<
typename strategy,
typename To,
typename Tr,
typename Tri,
typename Tab>
78 #ifdef CYCLE_PROFILING
81 strategy &strat,
const To *a_ptr,
const To *b_panel, Tri *c_panel,
82 Tr *c_ptr,
int ldc,
int kern_k,
unsigned int m_0,
83 unsigned int m_max,
unsigned int n_0,
unsigned int n_max,
const Tr *biasptr,
86 const int bblocks =
iceildiv(n_max - n_0, strategy::out_width());
89 #ifdef CYCLE_PROFILING 90 auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
93 strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
97 #ifdef CYCLE_PROFILING 98 auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() *
sizeof(Tr)));
100 strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act,
accumulate);
106 template<
typename strategy,
typename To,
typename Tr,
typename Tri,
typename Tab>
108 #ifdef CYCLE_PROFILING
111 strategy &strat,
const To *a_ptr,
const To *b_panel, Tri *,
112 Tr *c_ptr,
int ldc,
int kern_k,
unsigned int m_0,
unsigned int m_max,
113 unsigned int n_0,
unsigned int n_max,
const Tr *biasptr,
117 #ifdef CYCLE_PROFILING 118 auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
125 if (c_ptr ==
nullptr) {
126 offset_c_ptr =
nullptr;
128 offset_c_ptr = c_ptr + m_0 * ldc + n_0;
136 m_max-m_0, n_max - n_0, kern_k,
138 biasptr ? biasptr + n_0 :
nullptr, act,
accumulate,
145 template<
typename strategy,
typename To,
typename Tr,
typename Tri,
typename Tab>
147 #ifdef CYCLE_PROFILING
150 strategy &strat,
const To *a_ptr,
const To *b_panel, Tri *,
151 Tr *c_ptr,
int ldc,
int kern_k,
unsigned int m_0,
unsigned int m_max,
152 unsigned int n_0,
unsigned int n_max,
const Tr *,
156 #ifdef CYCLE_PROFILING 157 auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
163 c_ptr + m_0 * ldc + n_0, ldc,
165 m_max-m_0, n_max - n_0, kern_k,
167 col_bias + n_0, qp, n_0,
accumulate, acc_buff);
172 template<
typename strategy,
typename To,
typename Tr,
typename Tri,
typename Tab>
174 #ifdef CYCLE_PROFILING
177 strategy &strat,
const To *a_ptr,
const To *b_panel, Tri *c_panel,
178 Tr *c_ptr,
int ldc,
int kern_k,
unsigned int m_0,
179 unsigned int m_max,
unsigned int n_0,
unsigned int n_max,
const Tr *,
180 const Activation &,
bool,
const Requantize32 &qp,
const int32_t *col_bias,
183 const int bblocks =
iceildiv(n_max - n_0, strategy::out_width());
186 #ifdef CYCLE_PROFILING 187 auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
190 strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
194 #ifdef CYCLE_PROFILING 195 auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, (strategy::out_height() * bblocks * strategy::out_width() *
sizeof(Tr)));
201 for (
int i=0; i<bblocks; i++) {
202 unsigned int n_start = n_0 + (strategy::out_width() * i);
203 unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);
206 const int32_t *row_bias = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k);
209 c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(),
210 c_ptr + m_0 * ldc + n_start, ldc,
211 row_bias, col_bias + n_start, n_start);
/* Meta-function selecting the transform set for a strategy.
 *
 * By default (quantized==false) this resolves to the type of the
 * strategy's standard 'transforms' member; a 'true' specialization
 * elsewhere in this file selects the quantized variant instead.
 * (Extraction had dropped the access specifier and closing brace;
 * reconstructed here.) */
template<typename strategy, bool quantized>
class transform_type {
public:
    typedef decltype(strategy::transforms) type;
};
230 template<
typename strategy>
231 class transform_type<strategy, true> {
233 typedef decltype(strategy::transforms_quantized)
type;
/* Meta-function selecting the element type of the accumulation buffer.
 *
 * By default accumulation happens in the strategy's result type; the
 * Requantize32 specialization elsewhere in this file forces int32_t.
 * (Extraction had dropped the access specifier and closing brace;
 * reconstructed here.) */
template<typename strategy, typename OutputStage>
class accumulate_buffer_type {
public:
    typedef typename strategy::result_type type;
};
243 template<
typename strategy>
244 class accumulate_buffer_type<strategy, Requantize32> {
246 typedef int32_t
type;
251 template<
typename strategy,
typename To,
typename Tr,
typename OutputStage=Nothing,
bool MergeStep=true,
bool ForceThreadColumns=false>
253 typedef typename strategy::operand_type Toi;
254 typedef typename strategy::result_type Tri;
255 typedef typename accumulate_buffer_type<strategy, OutputStage>::type Tab;
258 const CPUInfo *
const _ci;
260 const unsigned int _Msize;
261 const unsigned int _Nsize;
262 const unsigned int _Ksize;
263 const unsigned int _Ksections;
264 const unsigned int _Ktotal;
265 const unsigned int _rounded_Ksize;
267 const unsigned int _nbatches;
268 const unsigned int _nmulti;
270 const bool _thread_columns;
274 const int _maxthreads;
278 unsigned int _k_block=0;
279 unsigned int _x_block=0;
280 unsigned int _Mround=0;
283 const Toi *_B_transposed=
nullptr;
284 void *_working_space=
nullptr;
286 Tab *_accumulation_buffer=
nullptr;
292 int32_t *col_bias =
nullptr;
295 const To *
const *
const * _indirect_buf =
nullptr;
298 std::unique_ptr<convolver<To>> _convolver =
nullptr;
300 unsigned int get_col_sum_size()
const {
301 if (std::is_same<OutputStage, Requantize32>::value) {
302 return _Nsize * _nmulti *
sizeof(int32_t);
316 unsigned int _k0=0, _x0=0, _multi=0;
319 unsigned int _x_start=0;
320 unsigned int _x_end=_parent._Nsize;
322 unsigned int _index=0;
324 bool _newkblock=
true;
333 unsigned int xmax() {
334 return std::min(_x0 + _parent._x_block, _x_end);
337 unsigned int kmax() {
338 return std::min(_k0 + _parent._k_block, _parent._Ktotal);
348 _x0 += _parent._x_block;
351 _k0 += _parent._k_block;
352 if (_k0 >= _parent._Ktotal) {
355 if (_multi >= _parent._nmulti) {
368 unsigned int k0(
void) {
return _k0; }
369 unsigned int x0(
void) {
return _x0; }
370 unsigned int multi(
void) {
return _multi; }
371 unsigned int index(
void) {
return _index; }
372 bool done(
void) {
return _done; }
373 bool newkblock(
void) {
return _newkblock; }
380 unsigned int get_total_k_depth()
const {
381 unsigned int k_depth = _k_block;
383 if (std::is_same<OutputStage, Requantize32>::value) {
384 k_depth +=
sizeof(int32_t) /
sizeof(Toi);
391 size_t get_a_working_size()
const {
392 if (_thread_columns) {
394 return ROUND_UP(
sizeof(Toi) * get_total_k_depth() * strategy::out_height() * _maxthreads);
397 return ROUND_UP(
sizeof(Toi) * get_total_k_depth() * _Mround * _nbatches);
402 size_t get_c_working_size()
const {
404 return ROUND_UP(
sizeof(Tri) * _x_block * strategy::out_height());
411 size_t get_accumulation_buffer_size()
const {
418 if (_k_block == _Ktotal) {
423 size_t size_per_buffer =
sizeof(Tab) * strategy::out_height() * strategy::out_width();
424 size_t num_buffers =
iceildiv(_Msize, strategy::out_height()) *
iceildiv(_Nsize, strategy::out_width()) * _nbatches * _nmulti;
426 return num_buffers * size_per_buffer;
430 Tab *get_accumulation_buffer(
unsigned int M,
unsigned int N,
unsigned int batch,
unsigned int multi)
const {
432 if (_accumulation_buffer ==
nullptr) {
437 size_t size_per_buffer = strategy::out_height() * strategy::out_width();
439 size_t buffer_rows =
iceildiv(_Msize, strategy::out_height());
440 size_t buffer_cols =
iceildiv(_Nsize, strategy::out_width());
441 size_t buffers_per_batch = (buffer_rows * buffer_cols);
442 size_t buffers_per_multi = buffers_per_batch * _nbatches;
445 size_t row =
M / strategy::out_height();
446 assert(
M % strategy::out_height() == 0);
447 size_t col =
N / strategy::out_width();
448 assert(
N % strategy::out_width() == 0);
450 size_t buffer_index = multi * buffers_per_multi + batch * buffers_per_batch + row * buffer_cols + col;
452 return _accumulation_buffer + (buffer_index * size_per_buffer);
455 int32_t row_sum_multiplier()
const {
456 if (std::is_same<OutputStage, Requantize32>::value) {
457 const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&_os);
459 return -qp->b_offset;
466 static bool is_thread_columns(
const GemmArgs &
args) {
468 if (ForceThreadColumns) {
473 if (
args._maxthreads == 1) {
478 int m_blocks =
iceildiv(
args._Msize, strategy::out_height()) *
args._nbatches;
481 if (
args._maxthreads > m_blocks) {
486 if (((
roundup(m_blocks,
args._maxthreads) * 100) / m_blocks) > 120) {
493 static unsigned int get_ktotal(
const GemmArgs &
args) {
497 static unsigned int get_k_block_size(
const GemmArgs &
args) {
498 if (
args._cfg &&
args._cfg->inner_block_size) {
499 return args._cfg->inner_block_size;
503 if (std::is_same<OutputStage, Requantize32>::value) {
504 return get_ktotal(
args);
507 const unsigned int L1_size =
args._ci->get_L1_cache_size();
508 unsigned int k_block;
512 k_block = (L1_size / 2) / (
sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
515 k_block /= strategy::k_unroll();
516 k_block = std::max(k_block, 1U) * strategy::k_unroll();
519 unsigned int num_k_blocks =
iceildiv(get_ktotal(
args), k_block);
525 k_block =
roundup(k_block, strategy::k_unroll());
532 static unsigned int get_x_block_size(
const GemmArgs &
args) {
533 if (is_thread_columns(
args)) {
535 return roundup(
args._Nsize, strategy::out_width());
538 if (
args._cfg &&
args._cfg->outer_block_size) {
539 return roundup(
args._cfg->outer_block_size, strategy::out_width());
542 unsigned int x_block;
543 const unsigned int L2_size =
args._ci->get_L2_cache_size();
544 const unsigned int k_block = get_k_block_size(
args);
548 const unsigned int scaled_l2_size = (L2_size * 9) / 10;
549 const unsigned int k_block_area = k_block *
sizeof(Toi) * (strategy::out_width() + strategy::out_height());
552 if (k_block_area > scaled_l2_size) {
553 return strategy::out_width();
556 x_block = (scaled_l2_size - k_block_area) / (
sizeof(Toi) * k_block);
559 x_block /= strategy::out_width();
560 x_block = std::max(x_block, 1u) * strategy::out_width();
563 unsigned int num_x_blocks =
iceildiv(
args._Nsize, x_block);
566 x_block =
roundup(x_block, strategy::out_width());
579 : _ci(
args._ci), _Msize(
args._Msize), _Nsize(
args._Nsize), _Ksize(
args._Ksize),
580 _Ksections(
args._Ksections), _Ktotal(get_ktotal(
args)),
581 _rounded_Ksize(
roundup(_Ksize, strategy::k_unroll())),
582 _nbatches(
args._nbatches), _nmulti(
args._nmulti), _thread_columns(is_thread_columns(
args)),
583 _act(
args._act), _maxthreads(
args._maxthreads), _nthreads(
args._maxthreads),
584 _k_block(get_k_block_size(
args)), _x_block(get_x_block_size(
args)), _Mround(
roundup(
args._Msize, strategy::out_height())),
589 : _ci(
args._ci), _Msize(
args._Msize), _Nsize(
args._Nsize), _Ksize(
args._Ksize),
590 _Ksections(
args._Ksections), _Ktotal(get_ktotal(
args)),
591 _rounded_Ksize(
roundup(_Ksize, strategy::k_unroll())),
592 _nbatches(
args._nbatches), _nmulti(
args._nmulti), _thread_columns(is_thread_columns(
args)),
593 _act(
args._act), _maxthreads(
args._maxthreads), _nthreads(
args._maxthreads),
594 _k_block(get_k_block_size(
args)), _x_block(get_x_block_size(
args)), _Mround(
roundup(
args._Msize, strategy::out_height())),
604 unsigned int row_blocks = (_Mround / strategy::out_height()) * _nbatches;
606 if (_thread_columns) {
607 return { row_blocks,
iceildiv(_Nsize, strategy::out_width()) };
610 return { row_blocks };
616 _nthreads = std::min(nthreads, _maxthreads);
620 void execute(
const ndcoord_t &work_range,
const ndcoord_t &,
int threadid)
override {
621 #ifdef CYCLE_PROFILING 626 assert(_B_transposed);
627 assert(_working_space);
628 int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
631 intptr_t working_space_v = reinterpret_cast<intptr_t>(_working_space);
632 if (working_space_v & 0x3f) {
633 intptr_t alignment_offset = 0x40 - (working_space_v & 0x3f);
634 working_space_bytes += alignment_offset;
639 const auto start = work_range.get_position(0);
640 const auto end = work_range.get_position_end(0);
643 const unsigned int window_per_batch = _Mround / strategy::out_height();
644 unsigned int batch_0 = start / window_per_batch;
645 unsigned int batch_end =
end / window_per_batch;
649 if (_thread_columns) {
650 const auto start_x = work_range.get_position(1) * strategy::out_width();
651 const auto end_x = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize);
653 Tri *
const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
654 Toi *
const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) +
655 (threadid *
sizeof(Toi) * get_total_k_depth() * strategy::out_height()));
657 for (
unsigned int multi=0; multi<_nmulti; multi++) {
658 for (
unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
659 unsigned int kmax=std::min(k0+_k_block, _Ktotal);
661 unsigned int rounded_width =
roundup(_Nsize, strategy::out_width());
663 const bool first_pass = (k0==0);
664 const bool last_pass = (kmax==_Ktotal);
667 unsigned int kern_k =
roundup(kmax - k0, strategy::k_unroll());
669 const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);
671 unsigned int batch = batch_0;
672 unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height();
674 for (
unsigned int p=start; p<
end; p++) {
675 unsigned int end_row = std::min(start_row + strategy::out_height(), _Msize);
679 #ifdef CYCLE_PROFILING 680 auto p=prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) *
sizeof(Toi));
684 typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>
::type transforms;
686 if (_indirect_buf !=
nullptr) {
687 transforms.PrepareA_indirect(a_panel,
688 _indirect_buf + (multi * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
689 _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
690 }
else if (_convolver) {
691 transforms.PrepareA_convolution(a_panel,
692 this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
693 this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
695 transforms.PrepareA(a_panel,
696 this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
697 this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier());
703 #ifdef CYCLE_PROFILING
707 strat, a_panel, b_ptr, c_panel,
709 this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc,
711 kern_k, start_row, end_row, start_x, end_x,
713 ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) :
nullptr),
715 (last_pass ? _act :
Activation()), !first_pass,
717 _os, col_bias + (multi * _Nsize),
719 static_cast<Tab *>(
nullptr));
722 start_row += strategy::out_height();
723 if (start_row >= _Msize) {
731 blockwalker current(*
this);
734 unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height();
735 unsigned int m_max = (
end - (batch_end * window_per_batch)) * strategy::out_height();
741 Toi *
const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
742 Tri *
const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
745 b_panel = _B_transposed;
757 unsigned int kern_k = 0;
758 unsigned int a_panel_stride = 0;
760 for (;!current.done();current.advance()) {
761 if (current.newkblock()) {
762 #ifdef CYCLE_PROFILING 763 auto p=prof.ScopedProfiler(PROFILE_PREPA, (
end - start) * strategy::out_height() * (current.kmax()-current.k0()) *
sizeof(Toi));
767 typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>
::type transforms;
769 for (
unsigned int batch = batch_0; batch <= batch_end; batch++) {
770 unsigned int first_m = (batch == batch_0) ? m_0 : 0;
771 unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
773 if (first_m >= last_m)
776 if (_indirect_buf !=
nullptr) {
777 transforms.PrepareA_indirect(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
778 _indirect_buf + (current.multi() * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
779 _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
780 }
else if (_convolver) {
781 transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
782 this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
783 this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
785 transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
786 this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
787 this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier());
792 kern_k =
roundup(current.kmax() - current.k0(), strategy::k_unroll());
798 if(std::is_same<OutputStage, Requantize32>::value) {
799 a_panel_stride = kern_k + (
sizeof(int32_t) /
sizeof(Toi));
801 a_panel_stride = kern_k;
806 for (
unsigned int batch = batch_0; batch <= batch_end; batch++) {
807 unsigned int first_m = (batch == batch_0) ? m_0 : 0;
808 unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
810 const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth();
812 if (first_m >= last_m)
819 unsigned int m_step = MergeStep ? strategy::out_height() : (last_m - first_m);
823 if (_accumulation_buffer && ((current.x0() != 0) || (current.xmax() < _Nsize))) {
824 m_step = strategy::out_height();
827 for (
unsigned int y=first_m; y<last_m; y+=m_step) {
828 unsigned int ymax = std::min(_Msize, y + m_step);
830 const bool first_pass = (current.k0() == 0);
831 const bool last_pass = (current.kmax() == _Ktotal);
834 Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);
838 if (_accumulation_buffer && !last_pass) {
839 result_ptr =
nullptr;
844 #ifdef CYCLE_PROFILING
848 strat, a_ptr, b_panel, c_panel,
850 result_ptr, this->_ldc,
852 kern_k, y, ymax, current.x0(), current.xmax(),
854 ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) :
nullptr),
856 (last_pass ? _act :
Activation()), !first_pass,
858 _os, col_bias + (current.multi() * _Nsize),
860 get_accumulation_buffer(y, current.x0(), batch, current.multi()) );
862 a_ptr += (strategy::out_height() * a_panel_stride);
866 b_panel += (
roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
874 size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads) + get_accumulation_buffer_size();
883 int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
884 intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);
888 if (working_space_int & 0x3F) {
889 diff = 0x40 - (working_space_int & 0x3F);
892 working_space_bytes += diff;
893 working_space_int += diff;
896 _working_space = reinterpret_cast<void *>(working_space_bytes);
899 if (get_accumulation_buffer_size() > 0) {
900 intptr_t acc_buff_int = working_space_int + get_a_working_size() + (get_c_working_size() * _maxthreads);
902 if (acc_buff_int & 0x3F) {
903 acc_buff_int += (0x40 - (acc_buff_int & 0x3F));
905 _accumulation_buffer = reinterpret_cast<Tab *>(acc_buff_int);
907 _accumulation_buffer =
nullptr;
917 return (_B_transposed==
nullptr);
921 unsigned int x_size =
roundup(_Nsize, strategy::out_width());
923 return (x_size * _Ktotal * _nmulti *
sizeof(Toi)) + get_col_sum_size();
927 if (std::is_same<OutputStage, Requantize32>::value) {
928 col_bias = reinterpret_cast<int32_t *>(in_buffer);
930 Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);
932 for (
unsigned int i=0; i<_nmulti; i++) {
934 compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
939 uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
940 Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
941 _B_transposed = buffer;
943 blockwalker current(*
this);
948 unsigned int k_size = (current.kmax() - current.k0());
957 const unsigned int rounded_section_size =
roundup(_Ksize, strategy::k_unroll());
962 for (
unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ){
963 unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());
966 unsigned int kpos = current.k0();
967 unsigned int kleft = k_size;
971 unsigned int k_section_base = kpos / rounded_section_size;
973 unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
976 unsigned int k_length = std::min(_Ksize - k_offset, kleft);
978 strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
980 (k_section_base * _Ksize) + k_offset,
981 (k_section_base * _Ksize) + k_offset + k_length);
984 unsigned int padded_length =
roundup(k_length, strategy::k_unroll());
986 buffer += strategy::out_width() * padded_length;
988 kpos += padded_length;
989 kleft -= padded_length;
992 }
while (current.advance());
997 uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
998 _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
999 col_bias = reinterpret_cast<int32_t *>(in_buffer);
1003 if (std::is_same<OutputStage, Requantize32>::value) {
1004 Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);
1007 qp->bias_multi_stride = bias_multi_stride;
1012 assert(string_len == _Ksize);
1013 _indirect_buf = ptr;
1017 assert(parms.input_channels == _Ksize);
1018 _convolver = std::unique_ptr<convolver<To>>(
new convolver<To>(parms));
1025 uint64_t total_macs = static_cast<uint64_t>(
args._nbatches) *
args._nmulti *
roundup(
args._Msize, strategy::out_height()) *
roundup(
args._Nsize, strategy::out_width()) * get_ktotal(
args);
1026 uint64_t prepare_bytes = static_cast<uint64_t>(
args._nbatches) *
args._nmulti *
roundup(
args._Msize, strategy::out_height()) * get_ktotal(
args) *
sizeof(Toi);
1027 uint64_t merge_bytes = static_cast<uint16_t>(
args._nbatches) *
args._nmulti * k_blocks *
roundup(
args._Msize, strategy::out_height()) *
roundup(
args._Nsize, strategy::out_width()) *
sizeof(Tr);
1031 float merge_cycles = static_cast<float>(merge_bytes) / params.
merge_bytes_cycle;
1033 float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
1037 float parallelism_available = static_cast<float>(
iceildiv(
args._Msize, strategy::out_height()) *
args._nbatches) * 0.9f;
1039 if (parallelism_available <
args._maxthreads) {
1040 total_cycles *= (static_cast<float>(
args._maxthreads) / parallelism_available);
1043 return static_cast<uint64_t>(total_cycles);
1048 template<
typename strategy,
typename To,
typename Tr,
typename OutputStage=Nothing>
1051 template<
typename strategy,
typename To,
typename Tr>
1054 template<
typename strategy,
typename To,
typename Tr>
T roundup(const T a, const T b)
ndrange_t get_window_size() const override
GemmInterleaved(const GemmArgs &args, const OutputStage &os)
T iceildiv(const T a, const T b)
void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override
arm_compute::ActivationLayerInfo::ActivationFunction Activation
Constant TensorID specifying an equivalent of null tensor.
void set_indirect_parameters(size_t string_len, const To *const *const *ptr) override
GemmInterleaved & operator=(GemmInterleaved &)=delete
decltype(strategy::transforms) typedef type
SimpleTensor< T2 > accumulate(const SimpleTensor< T1 > &src, DataType output_data_type)
size_t get_B_pretransposed_array_size() const override
void advance(CharPosition &pos, char ch)
GemmInterleaved(const GemmArgs &args)
size_t get_working_size() const override
void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override
void end(TokenStream &in, bool &valid)
void set_pretransposed_B_data(void *in_buffer) override
void set_convolution_parameters(ConvolutionParameters parms) override
void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height, const Tin *input, unsigned int in_stride, Tout *output, unsigned int out_stride, const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col)
bool B_pretranspose_required() const override
GemmInterleaved(GemmInterleaved &)=delete
void set_nthreads(int nthreads) override
void set_working_space(void *working_space) override
void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col)
bool B_is_pretransposed() const override
static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters ¶ms)
void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override