#if !defined(_WIN64) && !defined(__OpenBSD__)
#include <alloca.h>
#endif

#include <algorithm>
#include <cassert>
#include <cstring>
#include <memory>
#include <tuple>
#include <vector>

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

#ifndef UNUSED
#define __I_DEFINED_UNUSED
#define UNUSED(x)  ((void)(x))
#endif

// Dispatcher for the inner hybrid kernel, specialized below on the output stage
// (Nothing vs. Requantize32), on whether quantization runs as a separate pass, and on
// whether the weights are supplied in a fixed (striped) format.
template<typename OutputStage, bool SeparateQuantize, bool FixedFormat>
class run_hybrid_kernel {
public:
    template<typename strategy, typename Tlo, typename Tro, typename Tr>
    static inline void run (
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg,
        unsigned int M, unsigned int N, unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg,
        const Tr *bias_ptr, Activation act, bool accumulate,
        const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
};
template<> template<typename strategy, typename Tlo, typename Tro, typename Tr>
inline void run_hybrid_kernel<Nothing, false, false>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg,
        unsigned int M, unsigned int N, unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg,
        const Tr *bias_ptr, Activation act, bool accumulate,
        const Nothing &, const int32_t *, unsigned int) {
#ifdef CYCLE_PROFILING
    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif

    /* Indirect hybrid kernels read the full width of the bias, so detect the case where we
     * are writing a partial block and pad the bias for that block. */
    if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
        /* Break N into "N_bulk" (a multiple of the output width) and "N_remainder". */
        unsigned int N_remainder = N % strategy::out_width();
        unsigned int N_bulk = N - N_remainder;

        /* Output argument to be used for the tail. */
        IndirectOutputArg<Tr> offset_output = output_arg;

        /* If there is a "bulk" part, handle that and update "offset_output" appropriately. */
        if (N_bulk > 0) {
            strat.kernel(num_strings, string_ptr, A_arg, M, N_bulk, b_ptr, output_arg, bias_ptr, act, accumulate);

            if (output_arg.is_indirect) {
                offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk);
            } else {
                offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride);
            }
        }

        /* Pad the bias for the remainder columns into a stack buffer. */
        Tr *bias_pad_buffer = reinterpret_cast<Tr *>(alloca(strategy::out_width() * sizeof(Tr)));
        memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder * sizeof(Tr));

        /* Process the remainder, offsetting the B pointer as needed. */
        strat.kernel(num_strings, string_ptr, A_arg, M, N_remainder, b_ptr + (N_bulk * kern_k), offset_output, bias_pad_buffer, act, accumulate);
    } else {
        strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, bias_ptr, act, accumulate);
    }
}
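/* Illustrative note (not part of the original source): with out_width() == 16 and N == 100,
 * N_remainder == 4 and N_bulk == 96.  The bulk call writes columns [0, 96) using the real
 * bias pointer; the tail call then writes columns [96, 100) using the alloca'd copy, so the
 * kernel can read a full 16 bias values without running off the end of the caller's array. */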
template<> template<typename strategy, typename Tlo, typename Tro, typename Tr>
inline void run_hybrid_kernel<Nothing, false, true>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg,
        unsigned int M, unsigned int N, unsigned int kern_k, const Tro *b_ptr, size_t b_stride, IndirectOutputArg<Tr> output_arg,
        const Tr *bias_ptr, Activation act, bool accumulate,
        const Nothing &, const int32_t *, unsigned int) {
#ifdef CYCLE_PROFILING
    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif

    /* Fixed-format variant: B is laid out in stripes of stripe_width() columns, each
     * b_stride elements apart, so the kernel also takes the stride.  As above, pad the bias
     * if we are writing a partial block. */
    if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
        /* Break N into "N_bulk" (a multiple of the output width) and "N_remainder". */
        unsigned int N_remainder = N % strategy::out_width();
        unsigned int N_bulk = N - N_remainder;

        /* Output argument to be used for the tail. */
        IndirectOutputArg<Tr> offset_output = output_arg;

        /* If there is a "bulk" part, handle that and update "offset_output" appropriately. */
        if (N_bulk > 0) {
            strat.kernel(num_strings, string_ptr, A_arg, M, N_bulk, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);

            if (output_arg.is_indirect) {
                offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk);
            } else {
                offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride);
            }
        }

        /* Pad the bias for the remainder columns into a stack buffer. */
        Tr *bias_pad_buffer = reinterpret_cast<Tr *>(alloca(strategy::out_width() * sizeof(Tr)));
        memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder * sizeof(Tr));

        /* Process the remainder, offsetting the B pointer by whole stripes. */
        strat.kernel(num_strings, string_ptr, A_arg, M, N_remainder,
                     b_ptr + (N_bulk / strategy::stripe_width()) * b_stride, b_stride, offset_output,
                     bias_pad_buffer, act, accumulate);
    } else {
        strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, b_stride, output_arg, bias_ptr, act, accumulate);
    }
}
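/* Illustrative note (not part of the original source): in the fixed-format case B is stored
 * as stripes of stripe_width() columns, each stripe occupying b_stride elements.  Assuming
 * stripe_width() == 16, b_stride == 4096 and N_bulk == 96, the tail starts at stripe
 * 96 / 16 == 6, i.e. at b_ptr + 6 * 4096, rather than at an element offset scaled by kern_k
 * as in the non-fixed-format variant above. */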
template<> template<typename strategy, typename Tlo, typename Tro, typename Tr>
inline void run_hybrid_kernel<Requantize32, false, false>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg,
        unsigned int M, unsigned int N, unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg,
        const Tr *, Activation, bool,
        const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
#ifdef CYCLE_PROFILING
    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
    UNUSED(kern_k);

    /* This kernel requantizes as it goes: pass the quantization parameters and the column
     * sums (offset by n_0 to match this block of columns) straight through. */
    strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, &os, col_bias + n_0, n_0);
}
template<> template<typename strategy, typename Tlo, typename Tro, typename Tr>
inline void run_hybrid_kernel<Requantize32, true, false>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg,
        unsigned int M, unsigned int N, unsigned int kern_k, const Tro *b_ptr, size_t, IndirectOutputArg<Tr> output_arg,
        const Tr *, Activation, bool,
        const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
    UNUSED(kern_k);

    /* On this route only one kernel height of rows is processed at a time; the driver loop
     * below arranges for that.  Indirect output is not supported, as the requantizer writes
     * directly to memory. */
    assert(M <= strategy::out_height());
    assert(output_arg.is_indirect == false);

    /* We need a row sum buffer and an intermediate (32-bit) output buffer; both live on the
     * stack, the latter via alloca() as its size depends on the strategy. */
    int32_t row_sums[strategy::out_height()];
    typename strategy::result_type *result_buffer;

    unsigned int output_width = roundup(N, strategy::out_width());

    result_buffer = reinterpret_cast<typename strategy::result_type *>(alloca(output_width * strategy::out_height() * sizeof(typename strategy::result_type)));

    {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
        /* Perform the GEMM into the intermediate buffer. */
        strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, IndirectOutputArg<typename strategy::result_type>(result_buffer, output_width), nullptr, Activation(), false);
    }

    /* Row sums are only needed if the B quantization offset is non-zero. */
    if (os.b_offset != 0) {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (unsigned long)M * kern_k);
#endif
        row_sums_indirect(num_strings, string_ptr, A_arg, M, row_sums, &os);
    } else {
        memset(row_sums, 0, sizeof(int32_t) * strategy::out_height());
    }

    {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_QUANTIZE, (unsigned long)M * N);
#endif
        /* Requantize the intermediate buffer into the final output. */
        requantize_block_32(os, N, M, result_buffer, output_width, output_arg.direct.base, output_arg.direct.stride, row_sums, col_bias + n_0, n_0);
    }
}
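/* Illustrative note (not part of the original source): the row sums exist because, for
 * quantized data with offsets a_off and b_off,
 *   sum_k (a[i][k] - a_off) * (b[k][j] - b_off)
 *     = sum_k a[i][k]*b[k][j] - b_off * rowsum_a[i] - a_off * colsum_b[j] + K * a_off * b_off,
 * so the raw int32 accumulators can be corrected afterwards using per-row sums of A and the
 * precomputed per-column sums of B.  When b_offset == 0 the row-sum term vanishes, which is
 * why row_sums_indirect is only invoked when os.b_offset != 0 and the sums are simply
 * zeroed otherwise. */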
/* Helpers to pick up the stripe width and kernel weight format only when the strategy is a
 * fixed-format one (non-fixed-format strategies don't define these). */
template<typename strategy, bool FixedFormat>
struct stripe_width {
    static unsigned int get() {
        return strategy::stripe_width();
    }
};

template<typename strategy>
struct stripe_width<strategy, false> {
    static unsigned int get() {
        return 0;
    }
};

template<typename strategy, bool FixedFormat>
struct kernel_weight_format {
    static KernelWeightFormat get() {
        return strategy::kernel_weight_format();
    }
};

template<typename strategy>
struct kernel_weight_format<strategy, false> {
    static KernelWeightFormat get() {
        return KernelWeightFormat::NON_FIXED;
    }
};
// Implementation of the GemmCommon abstract class.
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool SeparateQuantize=false, bool FixedFormat=false>
class GemmHybridIndirect : public GemmCommon<To, Tr> {
    typedef typename strategy::lhs_operand_type Tloi;
    typedef typename strategy::rhs_operand_type Troi;
    typedef typename strategy::result_type Tri;

    GemmArgs    _args;
    OutputStage _os = {};

    /* Quantized support (in addition to the 'output stage' above). */
    int32_t *_col_bias = nullptr;

    const unsigned int _Ktotal;
    const unsigned int _rounded_Ksize;

    /* Blocking info. */
    const unsigned int _k_block;
    const unsigned int _n_block;
    const unsigned int _Mround;

    /* Pretransposed buffer. */
    const Troi *_B_transposed = nullptr;

    /* Indirect parameters.  _indirect_buf doubles as a flag to indicate that the "indirect" transform should be used. */
    const To * const * const * _indirect_buf = nullptr;

    /* Convolver - only set up for convolution problems, so also doubles as a flag. */
    std::unique_ptr<convolver<To>> _convolver = nullptr;

    const NDRange<4> _window_range;

    unsigned int get_col_sum_size() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            return _args._Nsize * _args._nmulti * sizeof(int32_t);
        } else {
            return 0;
        }
    }

    static unsigned int get_ktotal(const GemmArgs &args) {
        return args._Ksections * roundup(args._Ksize, strategy::k_unroll());
    }

    static unsigned int compute_k_block(const GemmArgs &args) {
        /* Some kernels don't support accumulate mode, and the quantizing route needs all of
         * K in one pass - in either case no K blocking is possible. */
        if (!strategy::supports_accumulate() || std::is_same<OutputStage, Requantize32>::value) {
            return get_ktotal(args);
        }

        /* A config-supplied inner block size (rounded to the K unroll) takes precedence. */
        if (args._cfg && args._cfg->inner_block_size) {
            return roundup(args._cfg->inner_block_size, strategy::k_unroll());
        }

        /* Otherwise target blocks of 2048 bytes of the input type, but only start blocking
         * once the total K exceeds 1.5x that target. */
        unsigned int target_block_size = 2048 / sizeof(To);
        auto ktotal = get_ktotal(args);

        if (ktotal > ((target_block_size*3)/2)) {
            /* Number of blocks needed to hit the target size... */
            unsigned int target_blocks = iceildiv(ktotal, target_block_size);

            /* ...the size of each block to spread K evenly across them... */
            unsigned int block_size = iceildiv(ktotal, target_blocks);

            /* ...rounded up to a multiple of the K unroll. */
            block_size = roundup(block_size, strategy::k_unroll());

            return block_size;
        }

        return ktotal;
    }
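    /* Illustrative note (not part of the original source): with To = float the target block
     * is 2048 / 4 = 512 elements.  Assuming k_unroll() == 2 and ktotal == 1200:
     *   1200 > (512*3)/2 == 768, so blocking kicks in;
     *   target_blocks = iceildiv(1200, 512) = 3;
     *   block_size    = iceildiv(1200, 3)   = 400;
     *   roundup(400, 2) = 400, so K is processed in blocks of 400. */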
    /* N blocking: quantizing GEMMs with row sums may need extra column parallelism;
     * otherwise use a single block of (a small multiple of) the output width. */
    static unsigned int compute_n_block(const GemmArgs &args, const OutputStage os = {}) {
        if (args._cfg && args._cfg->outer_block_size) {
            return args._cfg->outer_block_size;
        }

        /* "Asymmetric" quantizing GEMMs need a different approach - tall skinny blocks imply
         * a great deal of repeated row-sum work.  If row sums are involved, work out how much
         * column parallelism is needed and set the block size accordingly. */
        if (std::is_same<OutputStage, Requantize32>::value) {
            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&os);

            /* Row sums are only needed if b_offset isn't 0. */
            if (qp->b_offset != 0) {
                /* We can already parallelize across batches, multis and rows (in units of out_height). */
                int multi_row_parallelism = args._nmulti * args._nbatches * iceildiv(args._Msize, strategy::out_height());

                /* If that isn't enough, split up the columns too. */
                if (multi_row_parallelism < args._maxthreads) {
                    unsigned int columns_needed = iceildiv(args._maxthreads, multi_row_parallelism);
                    unsigned int n_block = iceildiv(args._Nsize, columns_needed);

                    return roundup(n_block, strategy::out_width());
                }

                /* Multi/batch/row parallelism is enough - don't split the columns. */
                return args._Nsize;
            }
        }

        if (args._Ksize <= 128 && args._maxthreads <= 16) {
            return strategy::out_width() * 3;
        }

        return strategy::out_width();
    }
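    /* Illustrative note (not part of the original source): for a quantized problem with
     * b_offset != 0, _Msize = 64, out_height() = 8, _nbatches = _nmulti = 1 and 16 threads,
     * multi_row_parallelism = 1 * 1 * iceildiv(64, 8) = 8 < 16, so columns_needed =
     * iceildiv(16, 8) = 2 and (with _Nsize = 1000, out_width() = 16)
     * n_block = roundup(iceildiv(1000, 2), 16) = roundup(500, 16) = 512. */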
public:
    GemmHybridIndirect(GemmHybridIndirect &) = delete;
    GemmHybridIndirect & operator= (GemmHybridIndirect &) = delete;

    /* Constructor with output stage. */
    GemmHybridIndirect(const GemmArgs &args, const OutputStage &os)
              : _args(args), _os(os), _Ktotal(get_ktotal(args)),
                _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
                _k_block(compute_k_block(args)), _n_block(compute_n_block(args, os)),
                _Mround(roundup(args._Msize, strategy::out_height())),
                _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
                              iceildiv(args._Nsize, _n_block), args._nmulti) {
        /* We take a copy of the arguments (not a pointer or reference), but there is no
         * lifetime requirement on the GemmConfig.  Clear the pointer to avoid accidents. */
        _args._cfg = nullptr;
    }

    /* Constructor without output stage. */
    GemmHybridIndirect(const GemmArgs &args)
              : _args(args), _Ktotal(get_ktotal(args)),
                _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
                _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
                _Mround(roundup(args._Msize, strategy::out_height())),
                _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
                              iceildiv(args._Nsize, _n_block), args._nmulti) {
        /* As above: clear the GemmConfig pointer on our copy of the arguments. */
        _args._cfg = nullptr;
    }
    /* Interface implementation - compulsory functions. */
    ndrange_t get_window_size() const override {
        return { _window_range.total_size() };
    }

    /* This kernel can always be dynamically scheduled. */
    bool supports_dynamic_scheduling() const override {
        return true;
    }

    // Execute
    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        strategy strat(_args._ci);

        std::vector<const To *>         in_row_ptrs;
        std::vector<const To * const *> in_row_strings;
        std::vector<unsigned int>       string_lengths;

        /* The convolution case needs per-section arrays of input row pointers. */
        if (_convolver) {
            in_row_ptrs    = std::vector<const To *>(strategy::out_height() * _args._Ksections, nullptr);
            in_row_strings = std::vector<const To * const *>(_args._Ksections, nullptr);

            for (unsigned int i=0; i<_args._Ksections; i++) {
                in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
            }
        }

        /* Any indirect mode needs the per-section string lengths. */
        if (_args._indirect_input) {
            string_lengths = std::vector<unsigned int>(_args._Ksections, 0);
        }

        /* Make sure we've been set up correctly. */
        assert(FixedFormat || _B_transposed);
        static_assert(std::is_same<To, Tloi>::value, "gemm_native: Operand types must be the same.");

        /* Each work item covers all of K for a given output block, so loop over K blocks here. */
        for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
            unsigned int kmax   = std::min(k0 + _k_block, _Ktotal);
            unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());

            const bool first_pass = (k0 == 0);
            const bool last_pass  = (kmax == _Ktotal);

            unsigned int first_section = (k0 / _rounded_Ksize);
            unsigned int first_offset  = (k0 % _rounded_Ksize);
            unsigned int kleft         = kern_k;
            unsigned int sections      = 0;
            unsigned int offset        = first_offset;

            if (_args._indirect_input) {
                while (kleft) {
                    /* The length that goes into 'string_lengths' excludes padding, but the
                     * amount subtracted from 'kleft' must account for any padding applied. */
                    string_lengths[sections] = std::min(kleft, _args._Ksize - offset);
                    kleft -= std::min(kleft, _rounded_Ksize - offset);
                    sections++;
                    offset = 0;
                }
            }

            /* Iterate over the assigned portion of the scheduling window. */
            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));

            if (p.done()) {
                return;
            }

            /* Separate-quantize and convolution routes process one 'out_height' block of rows
             * per kernel call; otherwise all assigned rows go into a single call. */
            const bool process_all_rows = (!SeparateQuantize && !_convolver);

            do {
                const unsigned int m_start = p.dim(0) * strategy::out_height();
                const unsigned int m_end   = process_all_rows ? std::min(p.dim0_max() * strategy::out_height(), _args._Msize) : std::min(m_start + strategy::out_height(), _args._Msize);
                const unsigned int batch   = p.dim(1);
                const unsigned int n0      = p.dim(2) * _n_block;
                const unsigned int nmax    = std::min(n0 + _n_block, _args._Nsize);
                const unsigned int multi   = p.dim(3);

                const Troi *b_panel;
                if (FixedFormat) {
                    b_panel = reinterpret_cast<const Troi *>(this->_Bptr) +
                              (multi * this->_B_multi_stride) +
                              ((n0 / stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
                              (k0 * stripe_width<strategy, FixedFormat>::get());
                } else {
                    b_panel = _B_transposed +
                              (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
                              (k0 * roundup(_args._Nsize, strategy::out_width())) +
                              (n0 * kern_k);
                }

                IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);

#ifdef CYCLE_PROFILING
                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
                if (_indirect_buf) {
                    run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, sections, string_lengths.data(),
                        IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(),
                        !first_pass,
                        /* Quantization parameters. */
                        _os, _col_bias+(multi * _args._Nsize), n0);
                } else if (_convolver) {
                    auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize);

                    unsigned int pos=0;
                    auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start);

                    while (!conv_rows.finished()) {
                        unsigned int width, conv_offset;

                        assert(pos < sections);

                        std::tie(width, conv_offset) = conv_rows.next_block(&(in_row_ptrs[pos * strategy::out_height()]));

                        if (pos==0) {
                            assert(conv_offset == first_offset);
                        }
                        assert(width == string_lengths[pos]);
                        pos++;
                    }
                    assert(pos == sections);

                    run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, sections, string_lengths.data(),
                        IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(),
                        !first_pass,
                        /* Quantization parameters. */
                        _os, _col_bias+(multi * _args._Nsize), n0);
                } else {
                    /* Length to process: this must exclude padding, which 'kmax' may include. */
                    const unsigned int len = (std::min(_args._Ksize, kmax) - k0);

                    run_hybrid_kernel<OutputStage, SeparateQuantize, FixedFormat>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, 1, &len,
                        IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(),
                        !first_pass,
                        /* Quantization parameters. */
                        _os, _col_bias+(multi * _args._Nsize), n0);
                }
            } while (process_all_rows ? p.next_dim1() : p.next_dim0());
        }
    }
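    /* Note (added for clarity): the scheduling window set up in the constructors is 4D -
     * dim 0 iterates blocks of out_height() rows, dim 1 batches, dim 2 blocks of _n_block
     * columns and dim 3 multis.  When 'process_all_rows' is true a single kernel call covers
     * every row block in the work range (the loop advances with next_dim1, skipping the row
     * dimension); otherwise each row block gets its own call (next_dim0). */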
    bool B_is_pretransposed() const override {
        return (FixedFormat == false);
    }

    bool B_pretranspose_required() const override {
        return (FixedFormat == false) && (_B_transposed == nullptr);
    }
    size_t get_B_pretransposed_array_size() const override {
        /* Transposed B buffer, plus the column sums for quantized GEMMs. */
        size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Troi);

        if (std::is_same<OutputStage, Requantize32>::value) {
            size += get_col_sum_size();
        }

        return size;
    }

    size_t get_B_pretranspose_window_size() const override {
        return iceildiv(_args._Nsize, strategy::out_width()) * _args._nmulti;
    }
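    /* Illustrative note (not part of the original source): for a single multi with
     * _Nsize = 1000, out_width() = 16, _Ktotal = 256 and a 1-byte Troi, the transposed
     * buffer is roundup(1000, 16) * 256 * 1 = 1008 * 256 = 258048 bytes, plus
     * 1000 * 4 = 4000 bytes of int32_t column sums if the output stage is Requantize32. */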
    void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            _col_bias = reinterpret_cast<int32_t *>(in_buffer);

            Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);

            for (unsigned int i=0; i<_args._nmulti; i++) {
                /* The input has no padding between sections, so Ksize * Ksections gives the total depth. */
                compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0);
            }
        }
    }

    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        pretranspose_B_array_part(in_buffer, B, ldb, B_multi_stride, 0, get_B_pretranspose_window_size());
    }

    void pretranspose_B_array_part(void *in_buffer, const To *B, const int ldb, const int B_multi_stride, size_t start, size_t end) override {
        /* Handle the column sums (and _col_bias setup) as part of the last work chunk. */
        if (end >= get_B_pretranspose_window_size()) {
            requantize_bias(in_buffer, B, ldb, B_multi_stride);
        }
        /* The transposed data goes after the column sums (in non-quantized cases
         * get_col_sum_size() is zero). */
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        Troi *buffer_base = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
        _B_transposed = buffer_base;

        strategy strat(_args._ci);

        /* Window units are out_width-sized strips of columns; each multi owns this many. */
        size_t work_per_multi = iceildiv(_args._Nsize, strategy::out_width());

        for (unsigned int multi=(start / work_per_multi); multi<_args._nmulti; multi++) {
            /* Work range covered by this multi. */
            size_t wk_start = multi * work_per_multi;
            size_t wk_end   = (multi + 1) * work_per_multi;

            assert(wk_end > start);

            /* Stop once we are past the requested end of the window. */
            if (wk_start >= end) {
                break;
            }

            for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                const unsigned int kmax = std::min(k0 + _k_block, _Ktotal);

                /* K covered by this block (padding included). */
                unsigned int k_size = kmax - k0;

                /* Restrict the column range to the requested part of the window. */
                size_t n_start = 0;
                size_t n_end   = _args._Nsize;

                if (start > wk_start) {
                    n_start = (start - wk_start) * strategy::out_width();
                }

                if (end < wk_end) {
                    n_end = (end - wk_start) * strategy::out_width();
                }

                /* Locate the output for this (multi, k block, column strip) in the buffer. */
                Troi *buffer = buffer_base +
                               (roundup(_args._Nsize, strategy::out_width()) * (multi * _Ktotal + k0)) +
                               (n_start * roundup(k_size, strategy::k_unroll()));

                if (_args._Ksections > 1) {
                    /* With multiple K sections, the pretransposed blocks interleave the
                     * sections: copy each column strip section by section, padding each
                     * section to the K unroll. */
                    const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());

                    for (unsigned int x0 = n_start; x0 < n_end; x0 += strategy::out_width()) {
                        unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);

                        /* Track progress through the K dimension of this block. */
                        unsigned int kpos  = k0;
                        unsigned int kleft = k_size;

                        while (kleft) {
                            /* Which section, and how far through it, are we? */
                            unsigned int k_section_base = kpos / rounded_section_size;
                            unsigned int k_offset = kpos - (k_section_base * rounded_section_size);

                            /* Transform no more than the rest of this section. */
                            unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);

                            strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
                                                      x0, xmax,
                                                      (k_section_base * _args._Ksize) + k_offset,
                                                      (k_section_base * _args._Ksize) + k_offset + k_length);

                            /* Advance the buffer by the (padded) amount just written. */
                            unsigned int padded_length = roundup(k_length, strategy::k_unroll());

                            buffer += strategy::out_width() * padded_length;

                            kpos  += padded_length;
                            kleft -= padded_length;
                        }
                    }
                } else {
                    /* A single K section can be transformed in one call. */
                    strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
                                              n_start, n_end, k0, std::min(kmax, _args._Ksize));
                }
            }
        }
    }
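    /* Illustrative note (not part of the original source): the pretranspose window is
     * measured in out_width-wide column strips per multi.  With _Nsize = 1000,
     * out_width() = 16 and _nmulti = 2, work_per_multi = iceildiv(1000, 16) = 63 and the
     * window size is 126.  A call with start = 70, end = 126 therefore begins at
     * multi = 70 / 63 = 1, where wk_start = 63, so n_start = (70 - 63) * 16 = 112 and
     * n_end stays at 1000: only columns [112, 1000) of the second multi are transformed. */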
    void set_pretransposed_B_data(void *in_buffer) override {
        /* As above: the column sums come first, followed by the transposed B data. */
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        _B_transposed = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
        _col_bias = reinterpret_cast<int32_t *>(in_buffer);
    }
    template <typename perf_type>
    static uint64_t estimate_cycles(const GemmArgs &args, const OutputStage &os = {}) {
        const PerformanceParameters params = strategy::template get_performance_parameters<perf_type>(args._ci);

        /* MAC cost: every (width-padded) output value needs the full padded K depth. */
        uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);
        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;

        /* Hybrid kernels are less efficient when the width is not a multiple of the kernel
         * width; that matters most for narrow problems, so penalise those widths by 15%. */
        if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
            mac_cycles *= 1.15f;
        }

        uint64_t total_cycles = mac_cycles;

        /* Quantizing kernels with a separate quantize pass add row-sum and requantize stages. */
        if (std::is_same<OutputStage, Requantize32>::value && SeparateQuantize) {
            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&os);

            /* Row sums touch every value of A, but are skipped when the B offset is zero. */
            uint64_t rowsum_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * get_ktotal(args);
            if (qp->b_offset == 0) {
                rowsum_bytes = 0;
            }
            float rowsum_cycles = static_cast<float>(rowsum_bytes) / params.prepare_bytes_cycle;

            /* Requantizing touches every value of C. */
            uint64_t requantize_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * args._Nsize;
            float requantize_cycles = static_cast<float>(requantize_bytes) / params.merge_bytes_cycle;

            total_cycles = mac_cycles + rowsum_cycles + requantize_cycles;
        }

        return total_cycles;
    }
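    /* Illustrative note (not part of the original source): the model above amounts to
     *   cycles ~= (batches * multis * M * roundup(N) * Ktotal) / kernel_macs_cycle
     *           + (row-sum values)      / prepare_bytes_cycle
     *           + (requantized values)  / merge_bytes_cycle
     * where the last two terms apply only to separate-quantize Requantize32 GEMMs. */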
    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);
            qp->bias = bias;
            qp->bias_multi_stride = bias_multi_stride;
        }
    }

    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
        assert(string_len == _args._Ksize);
        _indirect_buf = ptr;
    }

    void set_convolution_parameters(ConvolutionParameters parms) override {
        _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
    }

    GemmConfig get_config() override {
        GemmConfig c;
        c.method = GemmMethod::GEMM_HYBRID;
        c.inner_block_size = _k_block;
        c.outer_block_size = _n_block;
        c.filter = get_type_name<strategy>();
        c.weight_format = get_weight_format(kernel_weight_format<strategy, FixedFormat>::get(), sizeof(To));
        return c;
    }
};
template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
using GemmHybridIndirectFixedFormat = GemmHybridIndirect<strategy, To, Tr, OutputStage, false, true>;

#ifdef __I_DEFINED_UNUSED
#undef UNUSED
#undef __I_DEFINED_UNUSED
#endif