26 #if !defined(_WIN64) && !defined(__OpenBSD__)
42 #ifdef CYCLE_PROFILING
43 #include "profiler.hpp"
47 #define __I_DEFINED_UNUSED
48 #define UNUSED(x) ((void)(x))
// NOTE(review): this file is a garbled/partial extraction — statements are
// split across lines, leading tokens carry original line numbers, and many
// lines (including this class's closing brace) are missing. Comments below
// describe only what the visible fragments establish.
//
// Primary template: dispatcher for invoking a hybrid GEMM strategy kernel.
// Specialised (below) on the output-stage type (Nothing vs Requantize32),
// whether quantization runs as a separate pass, and fixed-format weights.
// Only declared here; each specialisation supplies its own definition.
58 template<
typename OutputStage,
bool SeparateQuantize,
bool FixedFormat>
59 class run_hybrid_kernel {
61 template<
typename strategy,
typename Tlo,
typename Tro,
typename Tr>
62 static inline void run (
// Profiler argument is presumably only present under CYCLE_PROFILING —
// the conditional parameter line itself was dropped by the extraction.
63 #ifdef CYCLE_PROFILING
66 const strategy &strat,
unsigned int num_strings,
const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg,
unsigned int M,
unsigned int N,
67 unsigned int kern_k,
const Tro *b_ptr,
size_t b_stride, IndirectOutputArg<Tr> output_arg,
const Tr *bias_ptr,
Activation act,
bool accumulate,
// Output-stage payload: quantization params, per-column bias, column offset.
68 const OutputStage &os,
const int32_t *col_bias,
unsigned int n_0 );
// Specialisation fragment: non-quantized (Nothing output stage) path with a
// conventionally pre-transposed B (the b_stride parameter is unnamed/unused).
// NOTE(review): the enclosing `run(...)` signature line and several closing
// braces were dropped by the extraction; code below is kept byte-identical.
72 template<
typename strategy,
typename Tlo,
typename Tro,
typename Tr>
74 #ifdef CYCLE_PROFILING
77 const strategy &strat,
unsigned int num_strings,
const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg,
unsigned int M,
unsigned int N,
78 unsigned int kern_k,
const Tro *b_ptr,
size_t, IndirectOutputArg<Tr> output_arg,
const Tr *bias_ptr,
Activation act,
bool accumulate,
79 const Nothing &,
const int32_t *,
unsigned int) {
80 #ifdef CYCLE_PROFILING
// Profiled work estimate: M * K * N rounded up to the kernel's output width.
81 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (
unsigned long)
M * kern_k *
roundup(
N, strategy::out_width()));
// Ragged-N bias case: when adding bias on the first (non-accumulating) pass
// and N is not a multiple of out_width(), split into a bulk call plus a
// remainder call that uses a stack-padded bias buffer, so the kernel never
// reads bias past the end of the caller's array.
87 if (bias_ptr && !
accumulate && (
N % strategy::out_width() != 0)) {
89 unsigned int N_remainder =
N % strategy::out_width();
90 unsigned int N_bulk =
N - N_remainder;
// Output descriptor for the remainder columns (offset by N_bulk below).
93 IndirectOutputArg<Tr> offset_output = output_arg;
// Bulk part: full out_width()-multiple columns with bias as supplied.
97 strat.kernel(num_strings, string_ptr, A_arg,
M, N_bulk, b_ptr, output_arg, bias_ptr, act,
accumulate);
// Advance the output by N_bulk columns — indirect vs direct forms differ.
// (The `else` line between the two assignments was dropped by extraction.)
99 if (output_arg.is_indirect) {
100 offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk);
102 offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride);
// Stack buffer (alloca — freed on return) holding one out_width()'s worth
// of bias; only the first N_remainder entries are copied from the caller.
107 Tr *bias_pad_buffer =
reinterpret_cast<Tr *
>(alloca(strategy::out_width() *
sizeof(Tr)));
108 memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder *
sizeof(Tr));
// Remainder part: B advances by N_bulk full columns of kern_k each.
111 strat.kernel(num_strings, string_ptr, A_arg,
M, N_remainder, b_ptr + (N_bulk * kern_k), offset_output, bias_pad_buffer, act,
accumulate);
// Fast path (presumably the `else` of the split above — brace lines were
// dropped): a single kernel call covering the whole of N.
113 strat.kernel(num_strings, string_ptr, A_arg,
M,
N, b_ptr, output_arg, bias_ptr, act,
accumulate);
// Specialisation fragment: Nothing output stage with FIXED-FORMAT weights —
// unlike the previous specialisation, b_stride is named and forwarded to the
// kernel, and the remainder B pointer is computed in stripe_width() units.
// NOTE(review): signature and brace lines missing (garbled extraction).
118 template<
typename strategy,
typename Tlo,
typename Tro,
typename Tr>
120 #ifdef CYCLE_PROFILING
123 const strategy &strat,
unsigned int num_strings,
const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg,
unsigned int M,
unsigned int N,
124 unsigned int kern_k,
const Tro *b_ptr,
size_t b_stride, IndirectOutputArg<Tr> output_arg,
const Tr *bias_ptr,
Activation act,
bool accumulate,
125 const Nothing &,
const int32_t *,
unsigned int) {
126 #ifdef CYCLE_PROFILING
127 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (
unsigned long)
M * kern_k *
roundup(
N, strategy::out_width()));
// Same ragged-N bias-padding strategy as the non-fixed-format path.
133 if (bias_ptr && !
accumulate && (
N % strategy::out_width() != 0)) {
135 unsigned int N_remainder =
N % strategy::out_width();
136 unsigned int N_bulk =
N - N_remainder;
139 IndirectOutputArg<Tr> offset_output = output_arg;
// Bulk columns — note b_stride is passed through here.
143 strat.kernel(num_strings, string_ptr, A_arg,
M, N_bulk, b_ptr, b_stride, output_arg, bias_ptr, act,
accumulate);
// Offset output by N_bulk columns (indirect/direct; `else` line dropped).
145 if (output_arg.is_indirect) {
146 offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk);
148 offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride);
// Padded bias lives on the stack for the duration of this call only.
153 Tr *bias_pad_buffer =
reinterpret_cast<Tr *
>(alloca(strategy::out_width() *
sizeof(Tr)));
154 memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder *
sizeof(Tr));
// Remainder: fixed-format B advances by whole stripes of b_stride bytes
// (N_bulk / stripe_width() stripes). Trailing args of this call dropped.
157 strat.kernel(num_strings, string_ptr, A_arg,
M, N_remainder,
158 b_ptr + (N_bulk / strategy::stripe_width()) * b_stride, b_stride, offset_output,
// Single-call path for the aligned / no-bias case (presumably the `else`).
161 strat.kernel(num_strings, string_ptr, A_arg,
M,
N, b_ptr, b_stride, output_arg, bias_ptr, act,
accumulate);
// Specialisation fragment: Requantize32 output stage with FUSED quantization —
// bias/activation/accumulate parameters are unnamed (ignored); the kernel is
// handed the quantization parameters and per-column bias directly.
// NOTE(review): signature line missing (garbled extraction).
166 template<
typename strategy,
typename Tlo,
typename Tro,
typename Tr>
168 #ifdef CYCLE_PROFILING
171 const strategy &strat,
unsigned int num_strings,
const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg,
unsigned int M,
unsigned int N,
172 unsigned int kern_k,
const Tro *b_ptr,
size_t, IndirectOutputArg<Tr> output_arg,
const Tr *,
Activation,
bool,
173 const Requantize32 &os,
const int32_t *col_bias,
unsigned int n_0 ) {
174 #ifdef CYCLE_PROFILING
175 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (
unsigned long)
M * kern_k *
roundup(
N, strategy::out_width()));
// Single fused call: col_bias is offset by n_0 so the kernel sees the bias
// entries for its own column window.
179 strat.kernel(num_strings, string_ptr, A_arg,
M,
N, b_ptr, output_arg, &os, col_bias + n_0, n_0);
// Specialisation fragment: Requantize32 with SEPARATE quantization — run the
// GEMM into a temporary int32 result buffer, then requantize that buffer into
// the final output. Requires a single row-block (M <= out_height()) and a
// direct (non-indirect) output so requantize_block_32 can address it.
// NOTE(review): signature and several interior lines missing (garbled).
183 template<
typename strategy,
typename Tlo,
typename Tro,
typename Tr>
185 #ifdef CYCLE_PROFILING
188 const strategy &strat,
unsigned int num_strings,
const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg,
unsigned int M,
unsigned int N,
189 unsigned int kern_k,
const Tro *b_ptr,
size_t, IndirectOutputArg<Tr> output_arg,
const Tr *,
Activation,
bool,
190 const Requantize32 &os,
const int32_t *col_bias,
unsigned int n_0 ) {
// Preconditions this path depends on (see comments above).
193 assert(
M <= strategy::out_height());
195 assert(output_arg.is_indirect ==
false);
// Per-row sums (needed when the B offset is non-zero) and the intermediate
// accumulator buffer; sized for a full out_height() x padded-N tile.
199 int32_t row_sums[strategy::out_height()];
200 typename strategy::result_type *result_buffer;
202 unsigned int output_width =
roundup(
N, strategy::out_width());
// alloca: freed automatically when this function returns.
204 result_buffer =
reinterpret_cast<typename strategy::result_type *
>(alloca(output_width * strategy::out_height() *
sizeof(
typename strategy::result_type)));
207 #ifdef CYCLE_PROFILING
208 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (
unsigned long)
M * kern_k *
roundup(
N, strategy::out_width()));
// GEMM pass: no bias, no activation, no accumulate — raw int32 results only.
211 strat.kernel(num_strings, string_ptr, A_arg,
M,
N, b_ptr, IndirectOutputArg<typename strategy::result_type>(result_buffer, output_width),
nullptr,
Activation(),
false);
// Row sums are only needed when the B zero-point is non-zero; the line that
// actually computes them appears to have been dropped by the extraction —
// only the memset of the zero case is visible. TODO confirm against source.
214 if (os.b_offset != 0) {
215 #ifdef CYCLE_PROFILING
216 auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (
unsigned long)
M * kern_k);
220 memset(row_sums, 0,
sizeof(int32_t) * strategy::out_height());
224 #ifdef CYCLE_PROFILING
225 auto p = prof.ScopedProfiler(PROFILE_QUANTIZE, (
unsigned long)
M *
N);
// Requantize the int32 tile into the direct output, applying row sums and
// the per-column bias window starting at n_0.
228 requantize_block_32(os,
N,
M, result_buffer, output_width, output_arg.direct.base, output_arg.direct.stride, row_sums, col_bias + n_0, n_0);
// Trait helpers selecting weight-layout properties by FixedFormat flag.
// NOTE(review): struct bodies are truncated by the extraction — the
// false-specialisation return values are not visible; do not assume them.
//
// stripe_width<strategy, true>: forwards the strategy's stripe width
// (fixed-format weights are stored in stripes of this many columns).
232 template<
typename strategy,
bool FixedFormat>
233 struct stripe_width {
234 static unsigned int get() {
235 return strategy::stripe_width();
// stripe_width<strategy, false>: body missing from this extraction —
// presumably returns a placeholder since non-fixed-format B is not striped.
239 template<
typename strategy>
240 struct stripe_width<
strategy, false> {
241 static unsigned int get() {
// kernel_weight_format<strategy, true>: forwards the strategy's declared
// fixed weight format enum.
246 template<
typename strategy,
bool FixedFormat>
247 struct kernel_weight_format {
249 return strategy::kernel_weight_format();
// kernel_weight_format<strategy, false>: body missing from this extraction.
253 template<
typename strategy>
254 struct kernel_weight_format<
strategy, false> {
// Main GEMM class template header + member fragments.
// NOTE(review): the `class ... : public ...` line itself was dropped by the
// extraction; only the template header, typedefs and data members survive.
263 template<
typename strategy,
typename To,
typename Tr,
typename OutputStage=Nothing,
bool SeparateQuantize=false,
bool FixedFormat=false>
// Internal operand/result types as seen by the strategy's micro-kernel.
265 typedef typename strategy::lhs_operand_type Tloi;
266 typedef typename strategy::rhs_operand_type Troi;
267 typedef typename strategy::result_type Tri;
// Output-stage parameters (e.g. Requantize32 state); default-constructed.
270 OutputStage _os = {};
// Per-column bias derived from B (quantized path); set in requantize_bias.
273 int32_t *_col_bias =
nullptr;
// Padded K dimensions and blocking parameters (fixed at construction).
275 const unsigned int _Ktotal;
276 const unsigned int _rounded_Ksize;
279 const unsigned int _k_block;
280 const unsigned int _n_block;
281 const unsigned int _Mround;
// Pretransposed B panel (set by pretranspose_B_array / set_pretransposed).
284 const Troi *_B_transposed=
nullptr;
// Indirect A: table of row-pointer arrays (multi/batch/section indexed —
// exact indexing not visible here; TODO confirm).
287 const To *
const *
const * _indirect_buf =
nullptr;
// Optional convolution-to-GEMM input transformer.
290 std::unique_ptr<convolver<To>> _convolver =
nullptr;
// Static sizing helpers. NOTE(review): many interior lines (returns, else
// branches, closing braces) were dropped by the extraction.
//
// get_col_sum_size: extra buffer bytes for column sums — non-zero only for
// the Requantize32 output stage (visible return values were dropped).
297 unsigned int get_col_sum_size()
const {
298 if (std::is_same<OutputStage, Requantize32>::value) {
// compute_k_block: choose the K-direction blocking.
309 static unsigned int compute_k_block(
const GemmArgs &
args) {
// No accumulation support (or quantized): must do all of K in one block.
311 if (!strategy::supports_accumulate() || std::is_same<OutputStage, Requantize32>::value) {
312 return get_ktotal(
args);
// Explicit configuration overrides the heuristic (rounded to k_unroll).
315 if (
args._cfg &&
args._cfg->inner_block_size) {
316 return roundup(
args._cfg->inner_block_size, strategy::k_unroll());
// Heuristic: target ~2048 bytes of LHS per block; split K into roughly
// equal blocks only when it exceeds 1.5x that target.
321 unsigned int target_block_size = 2048 /
sizeof(To);
322 auto ktotal = get_ktotal(
args);
324 if (ktotal > ((target_block_size*3)/2)) {
325 unsigned int target_blocks =
iceildiv(ktotal, target_block_size);
327 unsigned int block_size =
iceildiv(ktotal, target_blocks);
329 block_size =
roundup(block_size, strategy::k_unroll());
// compute_n_block: choose the N-direction (outer) blocking.
339 static unsigned int compute_n_block(
const GemmArgs &
args,
const OutputStage os = {}) {
340 if (
args._cfg &&
args._cfg->outer_block_size) {
341 return args._cfg->outer_block_size;
// Small-N / tall-skinny / quantized special cases — the return values of
// these branches were dropped by the extraction; TODO confirm.
344 if (
args._Nsize <= 64) {
348 if ((
args._Msize /
args._Nsize) > 155) {
355 if (std::is_same<OutputStage, Requantize32>::value) {
// If row-direction parallelism alone can't feed all threads, split N so
// each thread gets a roughly equal column slice (rounded to out_width).
361 int multi_row_parallelism =
args._nmulti *
args._nbatches *
iceildiv(
args._Msize, strategy::out_height());
364 if (multi_row_parallelism <
args._maxthreads) {
365 unsigned int columns_needed =
iceildiv(
args._maxthreads, multi_row_parallelism);
367 unsigned int n_block =
iceildiv(
args._Nsize, columns_needed);
369 return roundup(n_block, strategy::out_width());
// Small problems on few threads: wider blocks amortise loop overhead.
377 if (
args._Ksize <= 128 &&
args._maxthreads <= 16) {
378 return strategy::out_width() * 3;
381 return strategy::out_width();
// Constructor fragments. NOTE(review): the constructor signatures and parts
// of the initializer lists (_rounded_Ksize, _Mround, …) were dropped by the
// extraction; only these initializer/body fragments remain.
//
// Constructor taking an explicit OutputStage `os`.
390 : _args(
args), _os(os), _Ktotal(get_ktotal(
args)),
392 _k_block(compute_k_block(
args)), _n_block(compute_n_block(
args, os)),
// Drop the borrowed config pointer — it is not guaranteed to outlive us.
399 _args.
_cfg =
nullptr;
// Constructor without an output stage (default-constructed _os).
404 : _args(
args), _Ktotal(get_ktotal(
args)),
406 _k_block(compute_k_block(
args)), _n_block(compute_n_block(
args)),
413 _args.
_cfg =
nullptr;
// Main execution routine fragments (the `execute`-style entry point's
// signature was dropped by the extraction). Walks the K blocking loop and a
// work-partition `p` over rows/batches/column-blocks/multis, dispatching to
// run_hybrid_kernel<...>::run via one of three input paths: plain indirect
// buffers, the convolver, or direct strided A. Heavily hedged below —
// several call sites lost their leading lines.
428 #ifdef CYCLE_PROFILING
// Scratch tables for indirect input: per-section row-pointer arrays.
433 std::vector<const To *> in_row_ptrs;
434 std::vector<const To * const *> in_row_strings;
435 std::vector<unsigned int> string_lengths;
439 in_row_ptrs = std::vector<const To *>(strategy::out_height() * _args.
_Ksections,
nullptr);
440 in_row_strings = std::vector<const To * const *>(_args.
_Ksections,
nullptr);
// Each section's string points at its out_height()-sized slice of row ptrs.
442 for (
unsigned int i=0; i<_args.
_Ksections; i++) {
443 in_row_strings[i] = &(in_row_ptrs.data()[i * strategy::out_height()]);
449 string_lengths = std::vector<unsigned int>(_args.
_Ksections, 0);
// Fixed-format mode supplies B directly; otherwise B must be pretransposed.
453 assert(FixedFormat || _B_transposed);
454 static_assert(std::is_same<To, Tloi>::value,
"gemm_native: Operand types must be the same.");
// Outer K blocking loop.
460 for (
unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
461 unsigned int kmax = std::min(k0 + _k_block, _Ktotal);
462 unsigned int kern_k =
roundup(kmax-k0, strategy::k_unroll());
// first/last pass control bias application and (presumably) activation.
464 const bool first_pass = (k0 == 0);
465 const bool last_pass = (kmax == _Ktotal);
// Map k0 into (section, offset) within the rounded per-section K size.
467 unsigned int first_section = (k0 / _rounded_Ksize);
468 unsigned int first_offset = (k0 % _rounded_Ksize);
469 unsigned int kleft = kern_k;
471 unsigned int offset = first_offset;
// Consume the remainder of the current section (loop lines dropped).
479 kleft -= std::min(kleft, _rounded_Ksize -
offset);
// Fused, non-convolutional runs can take whole row ranges per step.
494 const bool process_all_rows = (!SeparateQuantize && !_convolver);
497 const unsigned int m_start = p.dim(0) * strategy::out_height();
498 const unsigned int m_end = process_all_rows ? std::min(p.dim0_max() * strategy::out_height(), _args.
_Msize) : std::min(m_start + strategy::out_height(), _args.
_Msize);
500 const unsigned int batch = p.dim(1);
501 const unsigned int n0 = p.dim(2) * _n_block;
502 const unsigned int nmax = std::min(n0 + _n_block, _args.
_Nsize);
503 const unsigned int multi = p.dim(3);
// Locate this work item's B panel: fixed-format indexes the user's B by
// stripes; otherwise index into the pretransposed buffer (the k0 term of
// the second expression was dropped by the extraction).
507 b_panel =
reinterpret_cast<const Troi *
>(this->_Bptr) +
508 (multi * this->_B_multi_stride) +
509 ((n0 / stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
510 (k0 * stripe_width<strategy, FixedFormat>::get());
512 b_panel = _B_transposed +
513 (multi *
roundup(_args.
_Nsize, strategy::out_width()) * _Ktotal) +
// Direct output descriptor for this (multi, batch, m_start, n0) tile.
518 IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
520 #ifdef CYCLE_PROFILING
521 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (
unsigned long)(m_end - m_start) * kern_k *
roundup(nmax-n0, strategy::out_width()));
// Path 1: indirect-buffer input (leading call lines dropped).
525 #ifdef CYCLE_PROFILING
528 strat,
sections, string_lengths.data(),
530 (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
531 (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 :
nullptr,
535 _os, _col_bias+(multi * _args.
_Nsize), n0);
536 }
// Path 2: convolution input — generate row pointers block by block.
else if (_convolver) {
537 auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize);
540 auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start);
542 while (!conv_rows.finished()) {
543 unsigned int width, conv_offset;
547 std::tie(width, conv_offset) = conv_rows.next_block(&(in_row_ptrs[pos * strategy::out_height()]));
// Sanity checks: the convolver must agree with the precomputed layout.
550 assert(conv_offset == first_offset);
552 assert(width == string_lengths[pos]);
558 #ifdef CYCLE_PROFILING
561 strat,
sections, string_lengths.data(),
563 (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
564 (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 :
nullptr,
568 _os, _col_bias+(multi * _args.
_Nsize), n0);
// Path 3: plain strided A input; one "string" of length len.
571 const unsigned int len = (std::min(_args.
_Ksize, kmax) - k0);
574 #ifdef CYCLE_PROFILING
578 IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
579 (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
580 (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 :
nullptr,
584 _os, _col_bias+(multi * _args.
_Nsize), n0);
586 }
// Advance the partition: whole-row runs step dim1, otherwise dim0.
while (process_all_rows ? p.next_dim1() : p.next_dim0());
// Pretranspose query fragments (method signatures dropped by extraction).
//
// B is owned/pretransposed by us only in the non-fixed-format case.
592 return (FixedFormat ==
false);
// Pretranspose still required if non-fixed-format and not yet done.
596 return (FixedFormat ==
false) && (_B_transposed==
nullptr);
// Buffer size: padded-N x Ktotal panel per multi, plus column sums for the
// quantized output stage.
605 size_t size =
roundup(_args.
_Nsize, strategy::out_width()) * _Ktotal * _args.
_nmulti *
sizeof(Troi);
610 if (std::is_same<OutputStage, Requantize32>::value) {
611 size += get_col_sum_size();
// requantize_bias: for the quantized output stage, compute per-column bias
// (column sums of B) into the head of `in_buffer`.
// NOTE(review): the loop body computing the sums was dropped; only the loop
// header over multis survives.
621 void requantize_bias(
void *in_buffer,
const To *B,
const int ldb,
const int B_multi_stride)
override {
622 if (std::is_same<OutputStage, Requantize32>::value) {
623 _col_bias =
reinterpret_cast<int32_t *
>(in_buffer);
627 for (
unsigned int i=0; i<_args.
_nmulti; i++) {
// Pretranspose-B fragments (the method signature, with start/end work
// bounds, was dropped). B panels live after the column-sum area.
644 uintptr_t buffer_int =
reinterpret_cast<uintptr_t
>(in_buffer);
645 Troi *buffer_base =
reinterpret_cast<Troi *
>(buffer_int + get_col_sum_size());
646 _B_transposed = buffer_base;
// Work is divided in units of out_width() column blocks per multi.
649 size_t work_per_multi =
iceildiv(_args.
_Nsize, strategy::out_width());
651 for (
unsigned int multi=(start / work_per_multi); multi<_args.
_nmulti; multi++) {
654 size_t wk_start = multi * work_per_multi;
655 size_t wk_end = (multi + 1) * work_per_multi;
657 assert(wk_end > start);
// Past this worker's range: stop (break/continue line dropped).
659 if (wk_start >=
end) {
663 for (
unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
664 const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);
667 unsigned int k_size = kmax - k0;
// Clamp this worker's [n_start, n_end) column range into this multi.
671 size_t n_end = _args.
_Nsize;
674 if (start > wk_start) {
675 n_start = (start - wk_start) * strategy::out_width();
680 n_end = (
end - wk_start) * strategy::out_width();
// Destination inside the pretransposed panel for (multi, k0, n_start).
684 Troi *buffer = buffer_base +
685 (
roundup(_args.
_Nsize, strategy::out_width()) * (multi * _Ktotal + k0)) +
686 (n_start *
roundup(k_size, strategy::k_unroll()));
// Sectioned (indirect/convolution) case: transform B one section-sized
// K chunk at a time so padding stays aligned per section.
696 const unsigned int rounded_section_size =
roundup(_args.
_Ksize, strategy::k_unroll());
701 for (
unsigned int x0 = n_start; x0 < n_end; x0 += strategy::out_width()) {
702 unsigned int xmax = std::min(x0 + strategy::out_width(), _args.
_Nsize);
705 unsigned int kpos = k0;
706 unsigned int kleft = k_size;
// Map kpos back to its section and intra-section offset.
710 unsigned int k_section_base = kpos / rounded_section_size;
712 unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
715 unsigned int k_length = std::min(_args.
_Ksize - k_offset, kleft);
// PrepareB args here are split — the x0/xmax arguments appear to have
// been dropped between these lines by the extraction. TODO confirm.
717 strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
719 (k_section_base * _args.
_Ksize) + k_offset,
720 (k_section_base * _args.
_Ksize) + k_offset + k_length);
// Advance by the k_unroll-padded length actually written.
723 unsigned int padded_length =
roundup(k_length, strategy::k_unroll());
725 buffer += strategy::out_width() * padded_length;
727 kpos += padded_length;
728 kleft -= padded_length;
// Non-sectioned case: one PrepareB call over the whole column range.
733 strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
734 n_start, n_end, k0, std::min(kmax, _args.
_Ksize));
// set_pretransposed_B_data fragments: adopt an externally prepared buffer
// (same layout: column sums first, then B panels).
742 uintptr_t buffer_int =
reinterpret_cast<uintptr_t
>(in_buffer);
743 _B_transposed =
reinterpret_cast<Troi *
>(buffer_int + get_col_sum_size());
744 _col_bias =
reinterpret_cast<int32_t *
>(in_buffer);
// Cycle-estimation template + misc setter fragments (signatures dropped).
//
// estimate_cycles<perf_type>: rough cost model — MAC count plus, for the
// separate-quantize path, row-sum and requantize passes.
750 template <
typename perf_type>
757 uint64_t total_macs =
static_cast<uint64_t
>(
args._nbatches) *
args._nmulti *
args._Msize *
roundup(
args._Nsize, strategy::out_width()) * get_ktotal(
args);
// Penalise awkward N sizes (below one out_width, or between 1x and 2x) —
// the adjustment applied inside this branch was dropped by the extraction.
765 if ((
args._Nsize < strategy::out_width()) || (
args._Nsize > strategy::out_width() &&
args._Nsize < 2*strategy::out_width())) {
769 uint64_t total_cycles = mac_cycles;
// Separate-quantize path adds row-sum and requantize bandwidth costs.
772 if (std::is_same<OutputStage, Requantize32>::value && SeparateQuantize) {
773 const Requantize32 *qp =
reinterpret_cast<const Requantize32 *
>(&os);
776 uint64_t rowsum_bytes =
static_cast<uint64_t
>(
args._nbatches) *
args._nmulti *
args._Msize * get_ktotal(
args);
// Row sums are skipped entirely when the B zero-point is zero (the
// zeroing of rowsum cost inside this branch was dropped).
779 if (qp->b_offset == 0) {
787 uint64_t requantize_bytes =
static_cast<uint64_t
>(
args._nbatches) *
args._nmulti *
args._Msize *
args._Nsize;
790 float requantize_cycles =
static_cast<float>(requantize_bytes) / params.
merge_bytes_cycle;
793 total_cycles = mac_cycles + rowsum_cycles + requantize_cycles;
// Quantization-parameter setter fragment (body mostly dropped).
800 if (std::is_same<OutputStage, Requantize32>::value) {
// Indirect-parameter setter fragment: strings must cover exactly _Ksize.
809 assert(string_len == _args.
_Ksize);
// Convolution-parameter setter: install the convolver for conv-as-GEMM.
815 _convolver = std::unique_ptr<convolver<To>>(
new convolver<To>(parms));
// get_config fragment: report the strategy name as the kernel filter.
824 c.
filter = get_type_name<strategy>();
831 template<
typename strategy,
typename To,
typename Tr,
typename OutputStage=Nothing>
836 #ifdef __I_DEFINED_UNUSED