#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

// Some macros used to decide how much working space to allocate.
// Round allocations up to the next cache line.
#define ALLOC_ROUND    64
#define ROUND_UP(x)    ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
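
// kernel_and_merge is a compile-time dispatcher for producing one block of output: the
// template parameters select whether the kernel writes to a temporary panel that is merged
// afterwards (MergeStep), whether B is in a fixed (non-pretransposed) format (FixedFormat),
// and which output stage (e.g. Requantize32) applies.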
template<bool MergeStep, bool FixedFormat, typename OutputStage>
class kernel_and_merge {
public:
    template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
    static void run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
        const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias,
        Tab *acc_buff);
};
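
// Specialisation: run the kernel into the temporary c_panel, then call the separate merge
// step (merge step enabled, pretransposed B, no output stage).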
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, false, Nothing>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
        const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
{
    const int bblocks = iceildiv(n_max - n_0, strategy::out_width());

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
    }

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
        strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate);
    }
}
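
// Specialisation: as above but for fixed-format B; the kernel is given the B stride and works
// on (n_max - n_0) columns directly.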
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, true, Nothing>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, size_t b_stride, Tri *c_panel,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
        const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
{
    {
#ifdef CYCLE_PROFILING
        const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

        strat.kernel(a_ptr, b_panel, b_stride, c_panel, 1, (n_max - n_0), kern_k);
    }

    {
#ifdef CYCLE_PROFILING
        const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
        auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
        strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate);
    }
}
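
// Specialisation: merge-free kernel writing directly to the output array (or to the
// accumulation buffer when c_ptr is null), with the bias pointer offset to this block.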
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<false, false, Nothing>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
        unsigned int n_0, unsigned int n_max, const Tr *biasptr,
        const Activation &act, bool accumulate, const Nothing &, const int32_t *,
        Tab *acc_buff)
{
#ifdef CYCLE_PROFILING
    auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
#endif

    // The C pointer may be null (output goes to the accumulation buffer), so only offset it if set.
    Tr *offset_c_ptr;

    if (c_ptr == nullptr) {
        offset_c_ptr = nullptr;
    } else {
        offset_c_ptr = c_ptr + m_0 * ldc + n_0;
    }

    strat.kernel(// A and B pointers are just the packed panels.
                 a_ptr, b_panel,
                 // Provide relevant supplied C pointer.
                 offset_c_ptr, ldc,
                 // M, N, K sizes.
                 m_max-m_0, n_max - n_0, kern_k,
                 // Bias (offset to this block), activation and accumulation flag.
                 biasptr ? biasptr + n_0 : nullptr, act, accumulate,
                 // Accumulation buffer.
                 acc_buff);
}
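
// Specialisation: merge-free quantizing kernel; the requantize parameters and column sums are
// passed straight through to the kernel.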
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<false, false, Requantize32>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
        unsigned int n_0, unsigned int n_max, const Tr *,
        const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias,
        Tab *acc_buff)
{
#ifdef CYCLE_PROFILING
    auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
#endif

    strat.kernel(// A and B pointers are just the packed panels.
                 a_ptr, b_panel,
                 // Provide relevant supplied C pointer.
                 c_ptr + m_0 * ldc + n_0, ldc,
                 // M, N, K sizes.
                 m_max-m_0, n_max - n_0, kern_k,
                 // Column sums (offset to this block), quantization parameters, N offset,
                 // accumulation flag and accumulation buffer.
                 col_bias + n_0, qp, n_0, accumulate, acc_buff);
}
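
// Specialisation: run the kernel into c_panel, then requantize one out_width() block at a
// time; the row sums sit just after the packed A data and the column sums in col_bias.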
template<>
template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
void kernel_and_merge<true, false, Requantize32>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        strategy &strat, const To *a_ptr, const To *b_panel, size_t, Tri *c_panel,
        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *,
        const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias,
        Tab *)
{
    const int bblocks = iceildiv(n_max - n_0, strategy::out_width());

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
#endif

        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
    }

    {
#ifdef CYCLE_PROFILING
        auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, ((m_max-m_0) * bblocks * strategy::out_width() * sizeof(Tr)));
#endif
        // The interleaved kernel produces one row-major out_width() x out_height() block at a
        // time, so requantize the output one block at a time.
        for (int i=0; i<bblocks; i++) {
            unsigned int n_start = n_0 + (strategy::out_width() * i);
            unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);

            // The row sums are stored immediately after the transposed A data.
            const int32_t *row_bias = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k);

            requantize_block_32(qp, (n_end - n_start), (m_max - m_0),
                                c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(),
                                c_ptr + m_0 * ldc + n_start, ldc,
                                row_bias, col_bias + n_start, n_start);
        }
    }
}
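
// transform_type selects which transform set to use: the quantized transforms also compute
// row sums while packing A.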
template<typename strategy, bool quantized>
class transform_type {
public:
    typedef decltype(strategy::transforms) type;
};

template<typename strategy>
class transform_type<strategy, true> {
public:
    typedef decltype(strategy::transforms_quantized) type;
};
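
// accumulate_buffer_type selects the element type of the accumulation buffer: the strategy's
// result type by default, int32_t for requantized GEMMs, float when forced.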
template<typename strategy, typename OutputStage, bool ForceFloat>
class accumulate_buffer_type {
public:
    typedef typename strategy::result_type type;
};

template<typename strategy>
class accumulate_buffer_type<strategy, Requantize32, false> {
public:
    typedef int32_t type;
};

template<typename strategy, typename OutputStage>
class accumulate_buffer_type<strategy, OutputStage, true> {
public:
    typedef float type;
};
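
// get_stripe_width: width of one stripe of a fixed-format B matrix, or 0 where B is
// pretransposed and the concept does not apply.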
template<typename strategy, bool FixedFormat>
struct get_stripe_width {
    static unsigned int get() {
        return 0;
    }
};

template<typename strategy>
struct get_stripe_width<strategy, true> {
    static unsigned int get() {
        return strategy::stripe_width();
    }
};
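
// get_kernel_weight_format reports the weight format a fixed-format kernel expects
// (non-fixed-format strategies just report NON_FIXED).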
template<typename strategy, bool FixedFormat, typename To>
struct get_kernel_weight_format {
    static KernelWeightFormat get() {
        return KernelWeightFormat::NON_FIXED;
    }
};

template<typename strategy, typename To>
struct get_kernel_weight_format<strategy, true, To> {
    static KernelWeightFormat get() {
        KernelWeightFormat kwf = strategy::kernel_weight_format();

        // If a BF16 kernel is being used for an FP32 problem, the BF16 flag needs to be set on
        // the reported weight format.
        if (std::is_same<To, float>::value && std::is_same<typename strategy::operand_type, bfloat16>::value) {
            uint32_t kwf_i = static_cast<uint32_t>(kwf);
            // ... (set the BF16 flag bit on the integer value)
            kwf = static_cast<KernelWeightFormat>(kwf_i);
        }

        return kwf;
    }
};
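
// GemmInterleaved: the interleaved GEMM driver.  It packs blocks of A on the fly, consumes a
// pretransposed (or fixed-format) B, walks the problem in cache-sized K and N blocks, and
// dispatches each block through kernel_and_merge.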
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool FixedFormat=false, bool ForceThreadColumns=false, bool ForceFloatAccumulate=false>
class GemmInterleaved : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;
    typedef typename accumulate_buffer_type<strategy, OutputStage, ForceFloatAccumulate>::type Tab;

    /* const properties set by constructor */
    const CPUInfo * const _ci;

    const unsigned int _Msize;
    const unsigned int _Nsize;
    const unsigned int _Ksize;
    const unsigned int _Ksections;
    const unsigned int _Ktotal;
    const unsigned int _rounded_Ksize;

    const unsigned int _nbatches;
    const unsigned int _nmulti;

    const bool _thread_columns;

    const Activation _act;

    const int _maxthreads;
    int _nthreads;

    /* Blocking info */
    unsigned int _k_block=0;
    unsigned int _x_block=0;
    unsigned int _Mround=0;

    /* Working space and pretransposed buffer */
    const Toi *_B_transposed=nullptr;
    void *_working_space=nullptr;

    Tab *_accumulation_buffer=nullptr;

    /* Output stage */
    OutputStage _os;

    /* Quantized support (in addition to the output stage above) */
    int32_t *col_bias = nullptr;

    /* Indirect parameters: _indirect_buf doubles as a flag to indicate that the "indirect" transform should be used. */
    const To * const * const * _indirect_buf = nullptr;

    /* Convolver: only set up for convolution problems, so also doubles as a flag. */
    std::unique_ptr<convolver<To>> _convolver = nullptr;

    unsigned int get_col_sum_size() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            return _Nsize * _nmulti * sizeof(int32_t);
        } else {
            return 0;
        }
    }
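
    // blockwalker iterates over the (X block, K block, multi) space in the order the main loop
    // consumes it, and flags when a new K block starts (which is when A must be re-packed).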
    class blockwalker {
    private:
        /* Loop parameters, based on the parent's configuration. */
        const GemmInterleaved &_parent;

        /* K, X and multi parameters for the current iteration. */
        unsigned int _k0=0, _x0=0, _multi=0;

        /* Range of X to iterate over - used in the "thread columns" cases. */
        unsigned int _x_start=0;
        unsigned int _x_end=_parent._Nsize;

        unsigned int _index=0;
        bool _done=false;
        bool _newkblock=true;

    public:
        blockwalker(const GemmInterleaved &parent) : _parent(parent) { }

        unsigned int xmax() {
            return std::min(_x0 + _parent._x_block, _x_end);
        }

        unsigned int kmax() {
            return std::min(_k0 + _parent._k_block, _parent._Ktotal);
        }

        /* Advance to the next block: X fastest, then K, then multi; returns false when done. */
        bool advance(void) {
            if (_done) {
                return false;
            }

            _newkblock=false;
            _x0 += _parent._x_block;
            if (_x0 >= _x_end) {
                _x0=_x_start;
                _k0 += _parent._k_block;
                if (_k0 >= _parent._Ktotal) {
                    _k0=0;
                    _multi++;
                    if (_multi >= _parent._nmulti) {
                        _done=true;
                        return false;
                    }
                }
                _newkblock=true;
            }
            _index++;

            return true;
        }

        unsigned int k0(void) { return _k0; }
        unsigned int x0(void) { return _x0; }
        unsigned int multi(void) { return _multi; }
        unsigned int index(void) { return _index; }
        bool done(void) { return _done; }
        bool newkblock(void) { return _newkblock; }
    };
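
    // Working space is one packed-A region (per-thread in the 2D case, shared and split by
    // window otherwise), one c_panel per thread, and optionally an accumulation buffer;
    // get_total_k_depth() adds the row-sum words appended to each packed A row when requantizing.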
    // "k block" has two distinct uses: deciding which iterations of K to process, and various
    // size/pointer computations.  The latter needs to account for the extra space used by the
    // row sums, if appropriate.
    unsigned int get_total_k_depth() const {
        unsigned int k_depth = _k_block;

        if (std::is_same<OutputStage, Requantize32>::value) {
            k_depth += sizeof(int32_t) / sizeof(Toi);
        }

        return k_depth;
    }

    // A working size.
    size_t get_a_working_size() const {
        if (_thread_columns) {
            // For 2D threading: allocate a buffer of one block of rows per thread.
            return ROUND_UP(sizeof(Toi) * get_total_k_depth() * strategy::out_height() * _maxthreads);
        } else {
            // For 1D threading: one buffer, regardless of thread count, divided according to the window.
            return ROUND_UP(sizeof(Toi) * get_total_k_depth() * _Mround * _nbatches);
        }
    }

    // C working size: one needed per thread.
    size_t get_c_working_size() const {
        if (MergeStep) {
            return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
        } else {
            return 0;
        }
    }

    // Accumulation buffer size.
    size_t get_accumulation_buffer_size() const {
        // If K blocking is not in use, no accumulation buffer is needed.
        if (_k_block == _Ktotal) {
            return 0;
        }

        // Each buffer holds one block of output.
        size_t size_per_buffer = sizeof(Tab) * strategy::out_height() * strategy::out_width();
        size_t num_buffers = iceildiv(_Msize, strategy::out_height()) * iceildiv(_Nsize, strategy::out_width()) * _nbatches * _nmulti;

        return num_buffers * size_per_buffer;
    }
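
    // The accumulation buffer is addressed as [multi][batch][row block][column block], one
    // out_height() x out_width() tile per entry.  Illustrative example (sizes assumed, not from
    // the source): out_height()=8, out_width()=12, _Msize=64, _Nsize=48 gives an 8x4 grid of
    // tiles per batch, and the tile at M=16, N=24 is row 2, column 2 of that grid.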
    // Get a pointer into the accumulation buffer for a given position.
    Tab *get_accumulation_buffer(unsigned int M, unsigned int N, unsigned int batch, unsigned int multi) const {
        // Don't do anything if there's no buffer.
        if (_accumulation_buffer == nullptr) {
            return nullptr;
        }

        // Here we are indexing an appropriately typed pointer, so no sizeof() is needed.
        size_t size_per_buffer = strategy::out_height() * strategy::out_width();

        size_t buffer_rows = iceildiv(_Msize, strategy::out_height());
        size_t buffer_cols = iceildiv(_Nsize, strategy::out_width());
        size_t buffers_per_batch = (buffer_rows * buffer_cols);
        size_t buffers_per_multi = buffers_per_batch * _nbatches;

        // M and N must reference the top-left corner of a block.
        size_t row = M / strategy::out_height();
        assert(M % strategy::out_height() == 0);
        size_t col = N / strategy::out_width();
        assert(N % strategy::out_width() == 0);

        size_t buffer_index = multi * buffers_per_multi + batch * buffers_per_batch + row * buffer_cols + col;

        return _accumulation_buffer + (buffer_index * size_per_buffer);
    }

    int32_t row_sum_multiplier() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&_os);

            return -qp->b_offset;
        }

        return 0;
    }
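
    // is_thread_columns decides between 1D (rows only) and 2D (rows and columns) threading:
    // 2D is used when forced, when there are more threads than M blocks, or when rounding the
    // M blocks up to the thread count would waste more than 20% of the work.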
    static bool is_thread_columns(const GemmArgs &args) {
        // The template parameter can force it.
        if (ForceThreadColumns) {
            return true;
        }
        // Never do this for single threaded cases.
        if (args._maxthreads == 1) {
            return false;
        }
        // How many blocks of work are available for threading on M?
        int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;

        // If the work can't be spread across all threads with row blocking alone, use columns too.
        if (args._maxthreads > m_blocks) {
            return true;
        }
        // If the row blocking is too wasteful, also use columns.
        if (((roundup(m_blocks, args._maxthreads) * 100) / m_blocks) > 120) {
            return true;
        }

        return false;
    }
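
    // get_k_block_size targets roughly half the L1 for the packed operands.  Illustrative
    // example (numbers assumed, not from the source): fp32 operands with a 32kB L1,
    // out_width()=12, out_height()=8 - half the L1 is 16384 bytes, so the initial k_block is
    // 16384 / (4 * 12) = 341, which is then snapped to a multiple of k_unroll() and rebalanced
    // so that all K blocks end up roughly equal.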
    static unsigned int get_k_block_size(const GemmArgs &args) {
        if (args._cfg && args._cfg->inner_block_size) {
            return roundup(args._cfg->inner_block_size, strategy::k_unroll());
        }

        // K blocking is not supported when requantizing, so use the full depth.
        if (std::is_same<OutputStage, Requantize32>::value) {
            return get_ktotal(args);
        }

        // Special-case blocking for SME kernels (assumption: this branch is guarded by the
        // is_sme<> kernel trait).
        if (is_sme<strategy>::value) {
            // Don't bother to block below this size threshold.
            unsigned int scaling_threshold = 1280 / sizeof(Toi);

            if (get_ktotal(args) <= scaling_threshold) {
                return get_ktotal(args);
            }

            // Once blocking, this (lower) threshold determines how many blocks to use.
            unsigned int max_block_size = 1024 / sizeof(Toi);

            unsigned int num_k_blocks = iceildiv(get_ktotal(args), max_block_size);

            unsigned int k_block = roundup(iceildiv(get_ktotal(args), num_k_blocks), strategy::k_unroll());

            return k_block;
        }

        const unsigned int L1_size = args._ci->get_L1_cache_size();
        unsigned int k_block;

        // k_block: Find out how much of the larger array can be loaded into half the cache.
        // This should account for associative caches.
        k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));

        // Needs to be (at least a single) multiple of the K unroll level.
        k_block /= strategy::k_unroll();
        k_block = std::max(k_block, 1U) * strategy::k_unroll();

        // Now tune to the presented problem size: this is how many blocks are needed.
        unsigned int num_k_blocks = iceildiv(get_ktotal(args), k_block);

        // So divide the space equally into that many blocks.
        k_block = iceildiv(get_ktotal(args), num_k_blocks);

        // And round UP to the K unroll level required.
        k_block = roundup(k_block, strategy::k_unroll());

        return k_block;
    }
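
    // get_x_block_size sizes the N blocking so that a k_block-deep strip of B plus the
    // L1-resident strip fits in roughly 90% of the L2, then rebalances across the N blocks.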
    static unsigned int get_x_block_size(const GemmArgs &args) {
        if (is_thread_columns(args)) {
            // In 2D mode, override the X block size: width is processed first.
            return roundup(args._Nsize, strategy::out_width());
        }

        if (args._cfg && args._cfg->outer_block_size) {
            return roundup(args._cfg->outer_block_size, strategy::out_width());
        }

        unsigned int x_block;
        const unsigned int L2_size = args._ci->get_L2_cache_size();
        const unsigned int k_block = get_k_block_size(args);

        // x_block: Work out how many rows (of length k_block) will fit in the L2.
        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
        const unsigned int scaled_l2_size = (L2_size * 9) / 10;
        const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());

        // .. if the L1 contents is bigger than the L2, just return a minimal size block.
        if (k_block_area > scaled_l2_size) {
            return strategy::out_width();
        }

        x_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);

        // Needs to be (at least a single) multiple of the kernel output width.
        x_block /= strategy::out_width();
        x_block = std::max(x_block, 1u) * strategy::out_width();

        // And tune to the presented problem size.
        unsigned int num_x_blocks = iceildiv(args._Nsize, x_block);
        x_block = iceildiv(args._Nsize, num_x_blocks);

        x_block = roundup(x_block, strategy::out_width());

        return x_block;
    }
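
    // Constructors: capture the problem description and blocking parameters from GemmArgs;
    // one overload takes an explicit output stage, the other default-constructs it.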
public:
    GemmInterleaved(const GemmArgs &args, const OutputStage &os)
        : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
          _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
          _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
          _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
          _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
          _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
          _os(os) { }

    /* Constructor without an output stage. */
    GemmInterleaved(const GemmArgs &args)
        : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
          _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
          _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
          _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
          _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
          _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
          _os() { }

    // Interface implementation - compulsory functions.
    ndrange_t get_window_size() const override {
        unsigned int row_blocks = (_Mround / strategy::out_height()) * _nbatches;

        if (_thread_columns) {
            return { row_blocks, iceildiv(_Nsize, strategy::out_width()) };
        } else {
            // _Mround is a multiple of out_height by definition.
            return { row_blocks };
        }
    }

    // set_nthreads: limit the number of threads that will actually be used.
    void set_nthreads(int nthreads) override {
        _nthreads = std::min(nthreads, _maxthreads);
    }
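
    // execute() performs the work for one slice of the execution window: either the 2D
    // "thread columns" path (each thread packs its own rows of A and sweeps a strip of
    // columns) or the 1D row-blocked path driven by a blockwalker.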
    void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        /* Make sure we've been set up correctly. */
        assert(FixedFormat || _B_transposed);
        assert(_working_space);
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);

        /* Align if needed. */
        intptr_t working_space_v = reinterpret_cast<intptr_t>(_working_space);
        if (working_space_v & 0x3f) {
            intptr_t alignment_offset = 0x40 - (working_space_v & 0x3f);
            working_space_bytes += alignment_offset;
        }

        strategy strat(_ci);

        const auto start = work_range.get_position(0);
        const auto end   = work_range.get_position_end(0);

        /* Translate 'start' and 'end' into a position within the batches and rows. */
        const unsigned int window_per_batch = _Mround / strategy::out_height();
        unsigned int batch_0   = start / window_per_batch;
        unsigned int batch_end = end / window_per_batch;

        // 2D mode: each thread packs its own block of A rows, then sweeps its strip of columns.
        if (_thread_columns) {
            const auto start_x = work_range.get_position(1) * strategy::out_width();
            const auto end_x = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize);

            Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
            Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) +
                                                          (threadid * sizeof(Toi) * get_total_k_depth() * strategy::out_height()));

            for (unsigned int multi=0; multi<_nmulti; multi++) {
                for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                    unsigned int kmax=std::min(k0+_k_block, _Ktotal);

                    unsigned int rounded_width = roundup(_Nsize, strategy::out_width());

                    const bool first_pass = (k0==0);
                    const bool last_pass = (kmax==_Ktotal);

                    // Figure out how many "K" the kernel will actually process.
                    unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());

                    const Toi *b_ptr = FixedFormat ?
                        reinterpret_cast<const Toi *>(this->_Bptr) + (multi * this->_B_multi_stride) +
                            ((start_x / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
                            (k0 * get_stripe_width<strategy, FixedFormat>::get()) :
                        _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);

                    unsigned int batch = batch_0;
                    unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height();

                    for (unsigned int p=start; p<end; p++) {
                        unsigned int end_row = std::min(start_row + strategy::out_height(), _Msize);

                        // Set up the transposed 'A' block for this strip of rows.
                        {
#ifdef CYCLE_PROFILING
                            auto p=prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) * sizeof(Toi));
#endif
                            typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;

                            if (_indirect_buf != nullptr) {
                                transforms.PrepareA_indirect(a_panel,
                                                             _indirect_buf + (multi * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
                                                             _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
                            } else if (_convolver) {
                                transforms.PrepareA_convolution(a_panel,
                                                                this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
                                                                this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
                            } else {
                                transforms.PrepareA(a_panel,
                                                    this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
                                                    this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier());
                            }
                        }

                        // Perform the kernel and merge step, together or separately as required.
                        kernel_and_merge<MergeStep, FixedFormat, OutputStage>::run(
#ifdef CYCLE_PROFILING
                            prof,
#endif
                            // Strategy and panel pointers
                            strat, a_panel, b_ptr, this->_ldb, c_panel,
                            // Result buffer pointers
                            this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc,
                            // K size, and M/N ranges
                            kern_k, start_row, end_row, start_x, end_x,
                            // Only do bias on the first pass
                            ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
                            // Only do activation on the last pass, and accumulation on any non-first pass.
                            (last_pass ? _act : Activation()), !first_pass,
                            // Pass in the output stage and column sums (requantizing kernels use these).
                            _os, col_bias + (multi * _Nsize),
                            // Accumulation buffer.
                            get_accumulation_buffer(start_row, start_x, batch, multi));

                        /* Advance to the next block of rows, wrapping into the next batch if needed. */
                        start_row += strategy::out_height();
                        if (start_row >= _Msize) {
                            start_row = 0;
                            batch++;
                        }
                    }
                }
            }
        } else {
            blockwalker current(*this);
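
            // 1D (row-blocked) path: walk the X/K/multi space; whenever a new K block starts,
            // re-pack the A panel for every batch, then run the kernel over batches and row blocks.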
            /* Translate 'start' and 'end' into a position within the batches and rows. */
            unsigned int m_0   = (start - (batch_0 * window_per_batch)) * strategy::out_height();
            unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();

            /* Working space for this thread. */
            Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
            Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));

            const Toi *b_panel;
            b_panel = _B_transposed;

            // newkblock() is always true on the first iteration, so these will be set properly on the first loop.
            unsigned int kern_k = 0;
            unsigned int a_panel_stride = 0;

            for (;!current.done();current.advance()) {
                if (current.newkblock()) {
#ifdef CYCLE_PROFILING
                    auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
#endif
                    typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;

                    for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                        unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                        unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                        if (first_m >= last_m)
                            continue;

                        if (_indirect_buf != nullptr) {
                            transforms.PrepareA_indirect(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                                         _indirect_buf + (current.multi() * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
                                                         _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
                        } else if (_convolver) {
                            transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                                            this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                                                            this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
                        } else {
                            transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                                                this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                                                this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier());
                        }
                    }

                    // Figure out how many "K" the kernel will actually process.
                    kern_k = roundup(current.kmax() - current.k0(), strategy::k_unroll());

                    // Requantizing GEMMs have the row sums built in to the transposed data, so
                    // the stride between rows is different.
                    if(std::is_same<OutputStage, Requantize32>::value) {
                        a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Toi));
                    } else {
                        a_panel_stride = kern_k;
                    }
                }

                // For FixedFormat cases, figure out the B pointer for this block.
                if (FixedFormat) {
                    b_panel = reinterpret_cast<const Toi *>(this->_Bptr) + (current.multi() * this->_B_multi_stride) +
                              ((current.x0() / get_stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
                              (current.k0() * get_stripe_width<strategy, FixedFormat>::get());
                }
                /* Do the actual work. */
                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                    const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth();

                    if (first_m >= last_m)
                        continue;

                    // For the merge case, work one out_height() block at a time (the size of the
                    // c_panel working space); otherwise the whole block can be done in one go.
                    unsigned int m_step = MergeStep ? strategy::out_height() : (last_m - first_m);

                    // With an accumulation buffer and N blocking active, still go one block at a time.
                    if (_accumulation_buffer && ((current.x0() != 0) || (current.xmax() < _Nsize))) {
                        m_step = strategy::out_height();
                    }

                    for (unsigned int y=first_m; y<last_m; y+=m_step) {
                        unsigned int ymax = std::min(_Msize, y + m_step);

                        const bool first_pass = (current.k0() == 0);
                        const bool last_pass = (current.kmax() == _Ktotal);

                        // Pointer to the relevant part of the result array.
                        Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);

                        // If using an accumulation buffer, only write to the result array on the last pass.
                        if (_accumulation_buffer && !last_pass) {
                            result_ptr = nullptr;
                        }

                        // Perform the kernel and merge step, together or separately as required.
                        kernel_and_merge<MergeStep, FixedFormat, OutputStage>::run(
#ifdef CYCLE_PROFILING
                            prof,
#endif
                            // Strategy and panel pointers
                            strat, a_ptr, b_panel, this->_ldb, c_panel,
                            // Result buffer pointers
                            result_ptr, this->_ldc,
                            // K size, and M/N ranges
                            kern_k, y, ymax, current.x0(), current.xmax(),
                            // Only do bias on the first pass
                            ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
                            // Only do activation on the last pass, and accumulation on any non-first pass.
                            (last_pass ? _act : Activation()), !first_pass,
                            // Pass in the output stage and column sums (requantizing kernels use these).
                            _os, col_bias + (current.multi() * _Nsize),
                            // Accumulation buffer.
                            get_accumulation_buffer(y, current.x0(), batch, current.multi()) );

                        a_ptr += (strategy::out_height() * a_panel_stride);
                    }
                }

                if (FixedFormat == false) {
                    b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
                }
            }
        }
    }
    // Interface implementation - working space.
    size_t get_working_size() const override {
        // One A buffer plus one C buffer per thread, plus the accumulation buffer.
        size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads) + get_accumulation_buffer_size();
        // ...
        return size;
    }

    void set_working_space(void *working_space) override {
        // Make sure everything ends up cache line aligned.
        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
        intptr_t working_space_int  = reinterpret_cast<intptr_t>(working_space);

        size_t diff = 0;

        if (working_space_int & 0x3F) {
            diff = 0x40 - (working_space_int & 0x3F);
        }

        working_space_bytes += diff;
        working_space_int += diff;

        _working_space = reinterpret_cast<void *>(working_space_bytes);

        // Set up the accumulation buffer, if one is needed.
        if (get_accumulation_buffer_size() > 0) {
            intptr_t acc_buff_int = working_space_int + get_a_working_size() + (get_c_working_size() * _maxthreads);
            // Make sure the accumulation buffer is aligned (needed if the other blocks are not a multiple of cache line length).
            if (acc_buff_int & 0x3F) {
                acc_buff_int += (0x40 - (acc_buff_int & 0x3F));
            }
            _accumulation_buffer = reinterpret_cast<Tab *>(acc_buff_int);
        } else {
            _accumulation_buffer = nullptr;
        }
    }

    bool B_is_pretransposed() const override {
        return (FixedFormat == false);
    }

    bool B_pretranspose_required() const override {
        return (FixedFormat == false) && (_B_transposed==nullptr);
    }

    size_t get_B_pretransposed_array_size() const override {
        unsigned int x_size = roundup(_Nsize, strategy::out_width());

        return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
    }

    size_t get_B_pretranspose_window_size() const override {
        size_t n_blocks = iceildiv(_Nsize, _x_block);
        size_t k_blocks = iceildiv(_Ktotal, _k_block);

        return n_blocks * k_blocks * _nmulti;
    }
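
    // B pretranspose: requantize_bias() fills in the per-column sums used by requantization,
    // and the windowed routine below packs/interleaves B one block at a time so the work can
    // be split across threads (this appears to correspond to pretranspose_B_array_part() in
    // current Compute Library sources).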
    void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            col_bias = reinterpret_cast<int32_t *>(in_buffer);

            Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);

            for (unsigned int i=0; i<_nmulti; i++) {
                // The input is assumed to have no padding between sections, so Ksize * Ksections gives the total depth.
                compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
            }
        }
    }

    void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, size_t start, size_t end) override {
        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() is zero.
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        _B_transposed = buffer;

        blockwalker current(*this);
        strategy strat(_ci);

        // Skip over blocks we aren't doing.
        for(size_t i = 0; i < start; i++) {
            buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
            current.advance();
        }

        size_t blocks_left = (end - start);

        // Double check that we haven't run out of work.
        if (current.done()) {
            blocks_left = 0;
        }

        for (; blocks_left > 0; blocks_left--) {
            // Do the actual work.
            unsigned int k_size = (current.kmax() - current.k0());

            if (_Ksections > 1) {
                // Padding is needed at the end of each K section.  The block walker coordinates
                // are in terms of the padded _Ktotal, but each section must be transformed with
                // reference to the original, unpadded, input.
                const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());

                // The output is one entire strip of out_width() columns at a time, so do this
                // one column strip at a time.
                for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ) {
                    unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());

                    // Track where we are and how much work is left.
                    unsigned int kpos = current.k0();
                    unsigned int kleft = k_size;

                    while (kleft) {
                        // Which section are we in, and how far into it are we?
                        unsigned int k_section_base = kpos / rounded_section_size;
                        unsigned int k_offset = kpos - (k_section_base * rounded_section_size);

                        // Either copy the rest of this section, or as much as is left to do.
                        unsigned int k_length = std::min(_Ksize - k_offset, kleft);

                        strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                                                  x0, xmax,
                                                  (k_section_base * _Ksize) + k_offset,             // K starting point - real row in B, from the section and true section length.
                                                  (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.

                        // Advance by the ROUNDED version of what was just done.
                        unsigned int padded_length = roundup(k_length, strategy::k_unroll());

                        buffer += strategy::out_width() * padded_length;
                        kpos  += padded_length;
                        kleft -= padded_length;
                    }
                }
            } else {
                // In the single K section case, the whole block can be processed in one go.
                strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                                          current.x0(), current.xmax(), current.k0(), std::min(current.kmax(), _Ksize));
                buffer += roundup(current.xmax() - current.x0(), strategy::out_width()) * roundup(current.kmax() - current.k0(), strategy::k_unroll());
            }

            // Advance to the next block, stop if we run off the end.
            if (!current.advance()) {
                break;
            }
        }
    }

    void set_pretransposed_B_data(void *in_buffer) override {
        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() is zero.
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        col_bias = reinterpret_cast<int32_t *>(in_buffer);
    }

    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);

            qp->bias = bias;
            qp->bias_multi_stride = bias_multi_stride;
        }
    }

    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
        assert(string_len == _Ksize);
        _indirect_buf = ptr;
    }

    void set_convolution_parameters(ConvolutionParameters parms) override {
        _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
    }
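
    // estimate_cycles: a rough cost model combining MAC throughput with prepare and merge
    // bandwidth from the strategy's PerformanceParameters, inflated when the available
    // row/batch parallelism cannot feed all of the requested threads.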
    template<typename perf_type>
    static uint64_t estimate_cycles(const GemmArgs &args) {
        unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));

        const PerformanceParameters &params = strategy::template get_performance_parameters<perf_type>(args._ci);

        uint64_t total_macs    = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);
        uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * get_ktotal(args) * sizeof(Toi);
        uint64_t merge_bytes   = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * args._Msize * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);

        float mac_cycles     = static_cast<float>(total_macs) / params.kernel_macs_cycle;
        float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
        float merge_cycles   = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;

        float total_cycles = mac_cycles + prepare_cycles + merge_cycles;

        // Threading is only possible over rows (and batches), which makes this a poor choice
        // when more parallelism is requested than is achievable that way.
        float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches) * 0.9f;

        if (parallelism_available < args._maxthreads) {
            total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
        }

        return static_cast<uint64_t>(total_cycles);
    }

        c.filter = get_type_name<strategy>();
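
// Convenience aliases of GemmInterleaved (presumably the no-merge and quantized variants used
// elsewhere in arm_gemm) are declared with the template parameter lists below.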
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>

template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>

template<typename strategy, typename To, typename Tr>

template<typename strategy, typename To, typename Tr>