#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

// Some macros used to decide how much working space to allocate.
// Round allocations up to the next cache line (e.g. ROUND_UP(70) == 128 with ALLOC_ROUND == 64).
#define ALLOC_ROUND 64
#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)

namespace arm_gemm {

// Some kernels output to a linear buffer and require a separate merge step;
// others write directly to the output matrix.  This helper class calls the
// appropriate functions, using templating to avoid calling non-existent
// functions.
template<bool MergeStep, typename OutputStage>
class kernel_and_merge {
public:
    template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
    static void run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
        const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias,
        Tab *acc_buff);
};
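// Dispatch summary (informal): the four specializations below cover
//   <true,  Nothing>      - run kernel into c_panel, then separate Merge
//   <false, Nothing>      - kernel writes (or accumulates) directly to C
//   <false, Requantize32> - quantizing kernel with integrated requantize
//   <true,  Requantize32> - run kernel into c_panel, then requantize_block_32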
// Run a kernel and call the separate merge step
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, Nothing>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
        const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
{
    const int bblocks = iceildiv(n_max - n_0, strategy::out_width());

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
    }

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
        strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate);
    }
}
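// Note: in the merged path above, the kernel writes out_width x out_height
// interleaved blocks into the temporary c_panel; Merge then de-interleaves
// them into the row-major C array, applying bias, activation and (when
// requested) accumulation with the existing C contents.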
// Run a kernel with integrated merge
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<false, Nothing>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
        unsigned int n_0, unsigned int n_max, const Tr *biasptr,
        const Activation &act, bool accumulate, const Nothing &, const int32_t *,
        Tab *acc_buff)
{
#ifdef CYCLE_PROFILING
    auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
#endif

    // The output pointer may be null (output goes to the accumulation buffer
    // instead); take care not to offset a null pointer.
    Tr *offset_c_ptr;

    if (c_ptr == nullptr) {
        offset_c_ptr = nullptr;
    } else {
        offset_c_ptr = c_ptr + m_0 * ldc + n_0;
    }

    strat.kernel(// A and B pointers are just the packed panels.
                 a_ptr, b_panel,
                 // Provide relevant part of output array and row stride.
                 offset_c_ptr, ldc,
                 // M, N and K sizes.
                 m_max-m_0, n_max - n_0, kern_k,
                 // Bias, activation, accumulation.  Offset the bias as needed.
                 biasptr ? biasptr + n_0 : nullptr, act, accumulate,
                 // Accumulation buffer.
                 acc_buff);
}
// Run a quantizing kernel with integrated requantize
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<false, Requantize32>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
        unsigned int n_0, unsigned int n_max, const Tr *,
        const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias,
        Tab *acc_buff)
{
#ifdef CYCLE_PROFILING
    auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
#endif

    strat.kernel(// A and B pointers are just the packed panels.
                 a_ptr, b_panel,
                 // Provide relevant part of output array and row stride.
                 c_ptr + m_0 * ldc + n_0, ldc,
                 // M, N and K sizes.
                 m_max-m_0, n_max - n_0, kern_k,
                 // Column sums, requantize parameters, first column, accumulation.
                 col_bias + n_0, qp, n_0, accumulate, acc_buff);
}
// Run a kernel and call the separate requantize step
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, Requantize32>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *,
        const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias,
        Tab *)
{
    const int bblocks = iceildiv(n_max - n_0, strategy::out_width());

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
    }

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
        // The kernel output is in blocks of out_width x out_height;
        // requantize each block separately.
        for (int i=0; i<bblocks; i++) {
            unsigned int n_start = n_0 + (strategy::out_width() * i);
            unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);

            // The row sums are appended to the transposed A data; get a pointer to them.
            const int32_t *row_bias = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k);

            requantize_block_32(qp, (n_end - n_start), (m_max - m_0),
                                c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(),
                                c_ptr + m_0 * ldc + n_start, ldc,
                                row_bias, col_bias + n_start, n_start);
        }
    }
}
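// Note on the layout assumed above: the kernel emits bblocks consecutive
// out_width x out_height row-major blocks into c_panel, and for requantizing
// GEMMs the per-row sums sit as int32_t values appended to the transposed A
// panel (hence the row_bias pointer arithmetic at the end of the A data).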
// Work out which transforms structure the strategy provides: regular kernels
// use 'transforms', quantized-merge kernels use 'transforms_quantized'.
template<typename strategy, bool quantized>
class transform_type {
public:
    typedef decltype(strategy::transforms) type;
};

template<typename strategy>
class transform_type<strategy, true> {
public:
    typedef decltype(strategy::transforms_quantized) type;
};
// Work out the type of the accumulation buffer: the strategy's result type
// in general, but always int32_t when requantizing.
template<typename strategy, typename OutputStage>
class accumulate_buffer_type {
public:
    typedef typename strategy::result_type type;
};

template<typename strategy>
class accumulate_buffer_type<strategy, Requantize32> {
public:
    typedef int32_t type;
};
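// For example (informal): given some strategy S,
//   transform_type<S, false>::type                == decltype(S::transforms)
//   transform_type<S, true>::type                 == decltype(S::transforms_quantized)
//   accumulate_buffer_type<S, Nothing>::type      == typename S::result_type
//   accumulate_buffer_type<S, Requantize32>::type == int32_t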
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool ForceThreadColumns=false>
class GemmInterleaved : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;
    typedef typename accumulate_buffer_type<strategy, OutputStage>::type Tab;
    /* const properties set by constructor */
    const CPUInfo * const _ci;

    const unsigned int _Msize;
    const unsigned int _Nsize;
    const unsigned int _Ksize;
    const unsigned int _Ksections;
    const unsigned int _Ktotal;
    const unsigned int _rounded_Ksize;

    const unsigned int _nbatches;
    const unsigned int _nmulti;

    const bool _thread_columns;

    const Activation _act;

    const int _maxthreads;
    int _nthreads;

    /* Blocking parameters */
    unsigned int _k_block=0;
    unsigned int _x_block=0;
    unsigned int _Mround=0;

    /* Working space, pretransposed buffer, accumulation buffer */
    const Toi *_B_transposed = nullptr;
    void *_working_space = nullptr;

    Tab *_accumulation_buffer = nullptr;

    /* Output stage */
    OutputStage _os;

    /* Quantized support (in addition to the output stage above) */
    int32_t *col_bias = nullptr;

    /* Indirect parameters: _indirect_buf doubles as a flag to indicate that the "indirect" transform should be used. */
    const To * const * const * _indirect_buf = nullptr;

    /* Convolver: only set up for convolution problems, so it also doubles as a flag. */
    std::unique_ptr<convolver<To>> _convolver = nullptr;
    unsigned int get_col_sum_size() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            return _Nsize * _nmulti * sizeof(int32_t);
        } else {
            return 0;
        }
    }
    /* We need to walk through the blocks of B in a few contexts, so factor that out here. */
    class blockwalker {
    private:
        /* Loop parameters: we only block up N and K, not M. */
        const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &_parent;

        unsigned int _k0=0, _x0=0, _multi=0;

        /* Range of X to iterate over - used in the "thread columns" cases. */
        unsigned int _x_start=0;
        unsigned int _x_end=_parent._Nsize;

        unsigned int _index=0;
        bool _done=false;
        bool _newkblock=true;

    public:
        blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent) : _parent(parent) { }

        blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent,
                    unsigned int x_start, unsigned int x_end) : _parent(parent), _x0(x_start), _x_start(x_start), _x_end(x_end) { }

        unsigned int xmax() {
            return std::min(_x0 + _parent._x_block, _x_end);
        }

        unsigned int kmax() {
            return std::min(_k0 + _parent._k_block, _parent._Ktotal);
        }

        /* Advance to the next block, returning false at the end. */
        bool advance(void) {
            if (_done) {
                return false;
            }

            _newkblock = false;
            _x0 += _parent._x_block;
            if (_x0 >= _x_end) {
                _x0 = _x_start;
                _k0 += _parent._k_block;
                if (_k0 >= _parent._Ktotal) {
                    _k0 = 0;
                    _multi++;
                    if (_multi >= _parent._nmulti) {
                        _done = true;
                        return false;
                    }
                }
                _newkblock = true;
            }
            _index++;

            return true;
        }

        unsigned int k0(void) { return _k0; }
        unsigned int x0(void) { return _x0; }
        unsigned int multi(void) { return _multi; }
        unsigned int index(void) { return _index; }
        bool done(void) { return _done; }
        bool newkblock(void) { return _newkblock; }
    };
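    // Traversal sketch (hypothetical numbers): with _x_block=100, _Nsize=250,
    // _k_block=64 and _Ktotal=128, blockwalker visits (x0, k0) in the order
    //   (0,0) (100,0) (200,0) (0,64) (100,64) (200,64)
    // for each multi in turn; newkblock() is true only on the first block of
    // each new K block.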
    // Total K depth needed per A panel row, including space for the row sums
    // appended in the quantized case.
    unsigned int get_total_k_depth() const {
        unsigned int k_depth = _k_block;

        if (std::is_same<OutputStage, Requantize32>::value) {
            k_depth += sizeof(int32_t) / sizeof(Toi);
        }

        return k_depth;
    }
    // A working size.
    size_t get_a_working_size() const {
        if (_thread_columns) {
            // For 2D threading: allocate a buffer of one block of rows per thread.
            return ROUND_UP(sizeof(Toi) * get_total_k_depth() * strategy::out_height() * _maxthreads);
        } else {
            // For 1D threading: one big block of memory for the whole rounded-up M.
            return ROUND_UP(sizeof(Toi) * get_total_k_depth() * _Mround * _nbatches);
        }
    }

    // C working size: one needed per thread.  Not needed if there is no merge step.
    size_t get_c_working_size() const {
        if (MergeStep) {
            return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
        } else {
            return 0;
        }
    }
    // Accumulation buffer size.
    size_t get_accumulation_buffer_size() const {
        // We only support an accumulation buffer for non-merge cases.
        if (MergeStep) {
            return 0;
        }

        // An accumulation buffer is only needed if we are actually blocking in K.
        if (_k_block == _Ktotal) {
            return 0;
        }

        // Non-merge with K blocking active: accumulation buffer needed.
        size_t size_per_buffer = sizeof(Tab) * strategy::out_height() * strategy::out_width();
        size_t num_buffers = iceildiv(_Msize, strategy::out_height()) * iceildiv(_Nsize, strategy::out_width()) * _nbatches * _nmulti;

        return num_buffers * size_per_buffer;
    }
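    // Sizing sketch (hypothetical numbers): with out_height()==8,
    // out_width()==12, Tab=int32_t, _Msize=64, _Nsize=48 and a single
    // batch/multi: size_per_buffer = 4*8*12 = 384 bytes and num_buffers =
    // iceildiv(64,8) * iceildiv(48,12) = 8*4 = 32, so 12288 bytes in total.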
    // Get a pointer into the accumulation buffer for a given block.
    Tab *get_accumulation_buffer(unsigned int M, unsigned int N, unsigned int batch, unsigned int multi) const {
        // Don't do anything if there's no buffer.
        if (_accumulation_buffer == nullptr) {
            return nullptr;
        }

        // Here we are indexing an appropriately typed pointer, so no sizeof() needed to convert to bytes.
        size_t size_per_buffer = strategy::out_height() * strategy::out_width();

        size_t buffer_rows = iceildiv(_Msize, strategy::out_height());
        size_t buffer_cols = iceildiv(_Nsize, strategy::out_width());
        size_t buffers_per_batch = (buffer_rows * buffer_cols);
        size_t buffers_per_multi = buffers_per_batch * _nbatches;

        // M and N must reference the top-left corner of a block.
        size_t row = M / strategy::out_height();
        assert(M % strategy::out_height() == 0);
        size_t col = N / strategy::out_width();
        assert(N % strategy::out_width() == 0);

        size_t buffer_index = multi * buffers_per_multi + batch * buffers_per_batch + row * buffer_cols + col;

        return _accumulation_buffer + (buffer_index * size_per_buffer);
    }
    int32_t row_sum_multiplier() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&_os);

            return -qp->b_offset;
        }

        return 0;
    }

    // Heuristics to decide whether to use the 'thread columns' regime.
    static bool is_thread_columns(const GemmArgs &args) {
        // The template parameter can force this mode on.
        if (ForceThreadColumns) {
            return true;
        }

        /* (further size-based heuristics elided) */
        return false;
    }
    // Get the total K size, padded up per section.
    static unsigned int get_ktotal(const GemmArgs &args) {
        return args._Ksections * roundup(args._Ksize, strategy::k_unroll());
    }
    static unsigned int get_k_block_size(const GemmArgs &args) {
        if (args._cfg && args._cfg->inner_block_size) {
            return roundup(args._cfg->inner_block_size, strategy::k_unroll());
        }

        // K blocking is not supported when requantizing.
        if (std::is_same<OutputStage, Requantize32>::value) {
            return get_ktotal(args);
        }

        const unsigned int L1_size = args._ci->get_L1_cache_size();
        unsigned int k_block;

        // k_block: find out how much of the larger array can be loaded into half the cache.
        // This should account for associative caches.
        k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));

        // Needs to be (at least a single) multiple of the K unroll level.
        k_block /= strategy::k_unroll();
        k_block = std::max(k_block, 1U) * strategy::k_unroll();

        // Now tune to the presented problem size: this is how many blocks we need.
        unsigned int num_k_blocks = iceildiv(get_ktotal(args), k_block);

        // So divide the space equally into that many blocks.
        k_block = iceildiv(get_ktotal(args), num_k_blocks);

        // And round UP to the K unroll level required.
        k_block = roundup(k_block, strategy::k_unroll());

        return k_block;
    }
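    // Worked example (hypothetical numbers): with a 32kB L1, 1-byte Toi,
    // out_width()==12, out_height()==8 and k_unroll()==4:
    //   k_block = (32768/2) / (1 * 12) = 1365, floored to a multiple of 4 -> 1364.
    // For Ktotal==2000 that gives num_k_blocks = iceildiv(2000, 1364) = 2, so
    // k_block = iceildiv(2000, 2) = 1000 (already a multiple of 4): two
    // roughly equal passes over K.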
    static unsigned int get_x_block_size(const GemmArgs &args) {
        if (is_thread_columns(args)) {
            // In 2D mode, override the X block size: we process full width first.
            return roundup(args._Nsize, strategy::out_width());
        }

        if (args._cfg && args._cfg->outer_block_size) {
            return roundup(args._cfg->outer_block_size, strategy::out_width());
        }

        const unsigned int L2_size = args._ci->get_L2_cache_size();
        const unsigned int k_block = get_k_block_size(args);
        unsigned int x_block;

        // x_block: work out how many rows (of length k_block) will fit in the L2.
        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
        const unsigned int scaled_l2_size = (L2_size * 9) / 10;
        const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());

        // If the L1 contents are bigger than the L2, just return a minimal size block.
        if (k_block_area > scaled_l2_size) {
            return strategy::out_width();
        }

        x_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);

        // Needs to be (at least a single) multiple of the kernel output width.
        x_block /= strategy::out_width();
        x_block = std::max(x_block, 1u) * strategy::out_width();

        // And round UP to the output width required.
        x_block = roundup(x_block, strategy::out_width());

        return x_block;
    }
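    // Worked example (hypothetical numbers): with a 512kB L2 the budget is
    // (524288*9)/10 = 471859 bytes.  With k_block==1000, 1-byte Toi,
    // out_width()==12, out_height()==8: k_block_area = 1000*1*20 = 20000, so
    // x_block = (471859-20000)/1000 = 451, floored to a multiple of 12
    // -> 444 columns of B per block.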
public:
    /* Constructor with output stage */
    GemmInterleaved(const GemmArgs &args, const OutputStage &os)
        : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
          _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
          _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
          _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
          _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
          _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
          _os(os) { }

    /* Constructor without output stage */
    GemmInterleaved(const GemmArgs &args)
        : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
          _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
          _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
          _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
          _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
          _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
          _os() { }
    // Interface implementation - compulsory functions.
    ndrange_t get_window_size() const override {
        unsigned int row_blocks = (_Mround / strategy::out_height()) * _nbatches;

        if (_thread_columns) {
            return { row_blocks, iceildiv(_Nsize, strategy::out_width()) };
        } else {
            // _Mround is a multiple of out_height by definition.
            return { row_blocks };
        }
    }
    void set_nthreads(int nthreads) override {
        _nthreads = std::min(nthreads, _maxthreads);
    }
    // Execute
    void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif

        /* Make sure we've been set up correctly. */
        assert(_B_transposed);
        assert(_working_space);
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);

        /* Align to a 64-byte boundary if needed. */
        intptr_t working_space_v = reinterpret_cast<intptr_t>(_working_space);
        if (working_space_v & 0x3f) {
            intptr_t alignment_offset = 0x40 - (working_space_v & 0x3f);
            working_space_bytes += alignment_offset;
        }

        strategy strat(_ci);

        const auto start = work_range.get_position(0);
        const auto end   = work_range.get_position_end(0);

        /* Translate 'start' and 'end' into a position within the batches and rows. */
        const unsigned int window_per_batch = _Mround / strategy::out_height();
        unsigned int batch_0   = start / window_per_batch;
        unsigned int batch_end = end / window_per_batch;
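        // Window mapping sketch (hypothetical numbers): with _Mround=64 and
        // out_height()==8 there are 8 window units per batch, so a work range
        // of start=10, end=14 gives batch_0=1 and covers row blocks 2..5 of
        // that batch (rows 16..47).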
        // 2D (thread columns) regime: each thread processes its own range of
        // output columns as well as rows.
        if (_thread_columns) {
            const auto start_x = work_range.get_position(1) * strategy::out_width();
            const auto end_x = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize);

            Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
            Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) +
                                                          (threadid * sizeof(Toi) * get_total_k_depth() * strategy::out_height()));

            for (unsigned int multi=0; multi<_nmulti; multi++) {
                for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                    unsigned int kmax = std::min(k0 + _k_block, _Ktotal);

                    unsigned int rounded_width = roundup(_Nsize, strategy::out_width());

                    const bool first_pass = (k0 == 0);
                    const bool last_pass = (kmax == _Ktotal);

                    // Size of the kernel's K dimension for this block.
                    unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());

                    const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);

                    unsigned int batch = batch_0;
                    unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height();

                    for (unsigned int p=start; p<end; p++) {
                        unsigned int end_row = std::min(start_row + strategy::out_height(), _Msize);

                        // Set up the transposed 'A' block.
                        {
#ifdef CYCLE_PROFILING
                            auto p=prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) * sizeof(Toi));
#endif
                            typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;

                            if (_indirect_buf != nullptr) {
                                transforms.PrepareA_indirect(a_panel,
                                    _indirect_buf + (multi * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
                                    _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
                            } else if (_convolver) {
                                transforms.PrepareA_convolution(a_panel,
                                    this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
                                    this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
                            } else {
                                transforms.PrepareA(a_panel,
                                    this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
                                    this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier());
                            }
                        }

                        // Perform the kernel and merge step(s).
                        kernel_and_merge<MergeStep, OutputStage>::run(
#ifdef CYCLE_PROFILING
                            prof,
#endif
                            // Strategy and panel pointers
                            strat, a_panel, b_ptr, c_panel,
                            // Result buffer pointers
                            this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc,
                            // K size, and M/N ranges
                            kern_k, start_row, end_row, start_x, end_x,
                            // Only do bias on the first pass
                            ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
                            // Only do activation on the last pass, and accumulation on any non-first pass.
                            (last_pass ? _act : Activation()), !first_pass,
                            // Pass in the output stage.
                            _os, col_bias + (multi * _Nsize),
                            // No accumulation buffer on this path.
                            static_cast<Tab *>(nullptr));

                        /* Advance to the next row block, wrapping into the next batch if needed. */
                        start_row += strategy::out_height();
                        if (start_row >= _Msize) {
                            start_row = 0;
                            batch++;
                        }
                    }
                }
            }
        } else {
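            // 1D (row-threaded) regime: unlike the 2D path above, threads
            // share a common walk over the B blocks (below) and split only
            // the M/batch dimension between them.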
            blockwalker current(*this);

            /* Translate 'start' and 'end' into a position within the batches and rows. */
            unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height();
            unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();

            Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
            Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));

            const Toi *b_panel;
            b_panel = _B_transposed;

            // newkblock() is always true on the first iteration, so these will be set properly on the first loop.
            unsigned int kern_k = 0;
            unsigned int a_panel_stride = 0;
            for (;!current.done();current.advance()) {
                if (current.newkblock()) {
#ifdef CYCLE_PROFILING
                    auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
#endif
                    // Transform the relevant rows of A for the new K block.
                    typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;

                    for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                        unsigned int first_m = (batch == batch_0) ? m_0 : 0;
                        unsigned int last_m = (batch == batch_end) ? m_max : _Msize;

                        if (first_m >= last_m)
                            continue;

                        if (_indirect_buf != nullptr) {
                            transforms.PrepareA_indirect(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                _indirect_buf + (current.multi() * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
                                _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
                        } else if (_convolver) {
                            transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                                this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
                        } else {
                            transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                                this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier());
                        }
                    }

                    // Size of the kernel's K dimension for this block.
                    kern_k = roundup(current.kmax() - current.k0(), strategy::k_unroll());

                    // Requantizing GEMMs have the row sums built into the transposed A data,
                    // so the stride between rows is increased by the size of the sums.
                    if (std::is_same<OutputStage, Requantize32>::value) {
                        a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Toi));
                    } else {
                        a_panel_stride = kern_k;
                    }
                }

                /* Do the actual work. */
                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                    unsigned int first_m = (batch == batch_0) ? m_0 : 0;
                    unsigned int last_m = (batch == batch_end) ? m_max : _Msize;

                    const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth();

                    if (first_m >= last_m)
                        continue;

                    // For the merge case we need to do this out_height() rows at a time, as
                    // that is the size of our intermediate buffer.  If we are not merging, we
                    // can do all the relevant rows in one go.
                    unsigned int m_step = MergeStep ? strategy::out_height() : (last_m - first_m);

                    // But with an accumulation buffer we still need to process one block at a
                    // time, unless there is no N blocking.
                    if (_accumulation_buffer && ((current.x0() != 0) || (current.xmax() < _Nsize))) {
                        m_step = strategy::out_height();
                    }
                    for (unsigned int y=first_m; y<last_m; y+=m_step) {
                        unsigned int ymax = std::min(_Msize, y + m_step);

                        const bool first_pass = (current.k0() == 0);
                        const bool last_pass = (current.kmax() == _Ktotal);

                        // Pointer to the relevant part of the result array.
                        Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);

                        // If we are using an accumulation buffer and this isn't the last pass,
                        // don't write anything to the result array.
                        if (_accumulation_buffer && !last_pass) {
                            result_ptr = nullptr;
                        }

                        // Perform the kernel and merge step(s).
                        kernel_and_merge<MergeStep, OutputStage>::run(
#ifdef CYCLE_PROFILING
                            prof,
#endif
                            // Strategy and panel pointers
                            strat, a_ptr, b_panel, c_panel,
                            // Result buffer pointers
                            result_ptr, this->_ldc,
                            // K size, and M/N ranges
                            kern_k, y, ymax, current.x0(), current.xmax(),
                            // Only do bias on the first pass
                            ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
                            // Only do activation on the last pass, and accumulation on any non-first pass.
                            (last_pass ? _act : Activation()), !first_pass,
                            // Pass in the output stage.
                            _os, col_bias + (current.multi() * _Nsize),
                            // Accumulation buffer.
                            get_accumulation_buffer(y, current.x0(), batch, current.multi()));

                        a_ptr += (strategy::out_height() * a_panel_stride);
                    }
                }

                b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
            }
        }
    }
    // Interface implementation - working space.
    size_t get_working_size() const override {
        // In all cases we need one A buffer, plus a C buffer per thread, plus the accumulation buffer.
        size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads) + get_accumulation_buffer_size();

        size += 128; // Add on two cache lines extra for alignment.

        return size;
    }
    void set_working_space(void *working_space) override {
        // Make sure everything ends up cache line aligned.
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
        intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);

        size_t diff = 0;

        if (working_space_int & 0x3F) {
            diff = 0x40 - (working_space_int & 0x3F);
        }

        working_space_bytes += diff;
        working_space_int += diff;

        _working_space = reinterpret_cast<void *>(working_space_bytes);

        // The accumulation buffer, if present, goes after the A and C buffers,
        // also cache line aligned.
        if (get_accumulation_buffer_size() > 0) {
            intptr_t acc_buff_int = working_space_int + get_a_working_size() + (get_c_working_size() * _maxthreads);

            if (acc_buff_int & 0x3F) {
                acc_buff_int += (0x40 - (acc_buff_int & 0x3F));
            }
            _accumulation_buffer = reinterpret_cast<Tab *>(acc_buff_int);
        } else {
            _accumulation_buffer = nullptr;
        }
    }
    // Interface implementation - pretransposed B.
    bool B_is_pretransposed() const override {
        return true;
    }

    bool B_pretranspose_required() const override {
        return (_B_transposed == nullptr);
    }
    size_t get_B_pretransposed_array_size() const override {
        unsigned int x_size = roundup(_Nsize, strategy::out_width());

        return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
    }
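    // Buffer layout sketch: the col_bias sums (get_col_sum_size() bytes, zero
    // when not quantized) come first, followed by the transposed B panels,
    // each padded up to multiples of out_width() in N and k_unroll() in K.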
    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            col_bias = reinterpret_cast<int32_t *>(in_buffer);

            Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);

            for (unsigned int i=0; i<_nmulti; i++) {
                compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
            }
        }

        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0.
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        _B_transposed = buffer;
        blockwalker current(*this);
        strategy strat(_ci);

        do {
            /* Figure out the size of each block. */
            unsigned int k_size = (current.kmax() - current.k0());

            if (_Ksections > 1) {
                // We need to insert padding at the end of each K section.  The blockwalker
                // coordinates are expressed in terms of the full, padded, _Ktotal, but each
                // section must be transformed with reference to the original, unpadded, input.
                const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());

                // The output is interleaved a full <out_width> of columns at a time, so as we
                // are breaking it up vertically we have to do it one column block at a time.
                for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width()) {
                    unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());

                    // Track where we are and how much work is left.
                    unsigned int kpos = current.k0();
                    unsigned int kleft = k_size;

                    while (kleft) {
                        // Which section are we in, and how far into it?  Based on the rounded-up section size.
                        unsigned int k_section_base = kpos / rounded_section_size;
                        unsigned int k_offset = kpos - (k_section_base * rounded_section_size);

                        // Either copy the rest of this section, or as much as requested.
                        unsigned int k_length = std::min(_Ksize - k_offset, kleft);

                        strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                                                  x0, xmax,
                                                  (k_section_base * _Ksize) + k_offset,              // K starting point - compute row in INPUT.
                                                  (k_section_base * _Ksize) + k_offset + k_length);  // K end point - starting point plus length.

                        // Update the position based on the ROUNDED version of what was just done.
                        unsigned int padded_length = roundup(k_length, strategy::k_unroll());

                        buffer += strategy::out_width() * padded_length;

                        kpos += padded_length;
                        kleft -= padded_length;
                    }
                }
            } else {
                // In the single K section case, the whole block can be processed in one go.
                strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                                          current.x0(), current.xmax(), current.k0(), std::min(current.kmax(), _Ksize));
                buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
            }
        } while (current.advance());
    }
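    // Walk-through of the K-sections split above (hypothetical numbers): with
    // _Ksize=10 and k_unroll()==4, rounded_section_size is 12.  A block
    // covering kpos 0..23 with _Ksections==2 splits into: section 0, offset 0,
    // length 10 (padded to 12), then section 1, offset 0, length 10 (padded to
    // 12) - i.e. the transform re-reads input rows 0..9 and then 10..19,
    // inserting the padding the packed format expects after each section.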
    void set_pretransposed_B_data(void *in_buffer) override {
        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0.
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        col_bias = reinterpret_cast<int32_t *>(in_buffer);
    }
    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);

            qp->bias = bias;
            qp->bias_multi_stride = bias_multi_stride;
        }
    }

    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
        assert(string_len == _Ksize);
        _indirect_buf = ptr;
    }
    void set_convolution_parameters(ConvolutionParameters parms) override {
        assert(parms.input_channels == _Ksize);
        _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
    }
    // Estimate cycles for the given problem, using the strategy's performance parameters.
    template<typename perf_type>
    static uint64_t estimate_cycles(const GemmArgs &args) {
        unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));

        const PerformanceParameters &params = strategy::template get_performance_parameters<perf_type>(args._ci);

        uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);
        uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * get_ktotal(args) * sizeof(Toi);
        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * args._Msize * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);

        float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
        float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
        float merge_cycles   = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;

        float total_cycles = mac_cycles + prepare_cycles + merge_cycles;

        // We can't thread over multis or width, which makes this a poor
        // choice in many threaded cases.  Penalize that here.
        float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches) * 0.9f;

        if (parallelism_available < args._maxthreads) {
            total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
        }

        return static_cast<uint64_t>(total_cycles);
    }
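    // The estimate works in cycle units: the MAC count divided by the
    // strategy's kernel_macs_cycle throughput, plus the prepare/merge byte
    // counts divided by their bytes-per-cycle figures, scaled up when the
    // available M/batch parallelism can't keep _maxthreads busy.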
    GemmConfig get_config() override {
        GemmConfig c;

        c.method = GemmMethod::GEMM_INTERLEAVED;
        c.inner_block_size = _k_block;
        c.outer_block_size = _x_block;
        c.filter = get_type_name<strategy>();

        return c;
    }
};
// Aliases for the variations.
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>;

template<typename strategy, typename To, typename Tr>
using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>;

template<typename strategy, typename To, typename Tr>
using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>;

} // namespace arm_gemm