#if !defined(_WIN64) && !defined(__OpenBSD__)
#include <alloca.h>
#endif

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

#ifndef UNUSED
#define __I_DEFINED_UNUSED
#define UNUSED(x)  ((void)(x))
#endif

// The kernel has to be invoked differently for the quantizing and non-quantizing cases, so this shim
// class provides a single entry point which is specialized below for each output stage.
template<typename OutputStage, bool SeparateQuantize = false>
class run_hybrid_kernel {
public:
    template<typename strategy, typename Tlo, typename Tro, typename Tr>
    static inline void run (
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
        const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
};
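// Specialization for the unquantized case (OutputStage == Nothing): bias addition, activation and
// accumulation are handled by the kernel itself, so this shim only has to deal with output blocks
// narrower than the kernel's output width, where the bias has to be padded.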
template<>
template<typename strategy, typename Tlo, typename Tro, typename Tr>
inline void run_hybrid_kernel<Nothing, false>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
        const Nothing &, const int32_t *, unsigned int) {
#ifdef CYCLE_PROFILING
    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
    UNUSED(kern_k);

    if (bias_ptr && !accumulate && (N % strategy::out_width() != 0)) {
        /* Split N into a "bulk" part (a multiple of the output width) and a remainder. */
        unsigned int N_remainder = N % strategy::out_width();
        unsigned int N_bulk      = N - N_remainder;

        /* Output argument to be used for the tail. */
        IndirectOutputArg<Tr> offset_output = output_arg;

        /* Process the bulk (if any) and update offset_output appropriately. */
        if (N_bulk > 0) {
            strat.kernel(num_strings, string_ptr, A_arg, M, N_bulk, b_ptr, output_arg, bias_ptr, act, accumulate);

            if (output_arg.is_indirect) {
                offset_output = IndirectOutputArg<Tr>(output_arg.indirect.ptr, output_arg.indirect.offset + N_bulk);
            } else {
                offset_output = IndirectOutputArg<Tr>(output_arg.direct.base + N_bulk, output_arg.direct.stride);
            }
        }

        /* Pad the bias for the remainder block. */
        Tr *bias_pad_buffer = reinterpret_cast<Tr *>(alloca(strategy::out_width() * sizeof(Tr)));
        memcpy(bias_pad_buffer, bias_ptr + N_bulk, N_remainder * sizeof(Tr));

        strat.kernel(num_strings, string_ptr, A_arg, M, N_remainder, b_ptr + (N_bulk * kern_k), offset_output, bias_pad_buffer, act, accumulate);
    } else {
        strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, bias_ptr, act, accumulate);
    }
}
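// Specialization for the quantized case with fused requantization (Requantize32, SeparateQuantize ==
// false): the quantization parameters and per-column bias are passed straight through to the kernel,
// which writes requantized output directly.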
template<>
template<typename strategy, typename Tlo, typename Tro, typename Tr>
inline void run_hybrid_kernel<Requantize32, false>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
        const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
#ifdef CYCLE_PROFILING
    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
    UNUSED(kern_k);

    strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, &os, col_bias + n_0, n_0);
}
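// Specialization for the quantized case with a separate quantize pass (SeparateQuantize == true): the
// GEMM runs into a temporary int32 buffer, row sums are computed (only needed when the B offset is
// non-zero), and requantize_block_32() then produces the final output block.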
template<>
template<typename strategy, typename Tlo, typename Tro, typename Tr>
inline void run_hybrid_kernel<Requantize32, true>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<Tlo> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const Tro *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
        const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
    UNUSED(kern_k);
    // This route only processes one kernel height of rows at a time; the driver loop guarantees this.
    assert(M <= strategy::out_height());
    // Indirect output is not supported here, as the quantizer can't handle it.
    assert(output_arg.is_indirect == false);

    // Row sum buffer and intermediate (un-quantized) result buffer, both on the stack.
    int32_t row_sums[strategy::out_height()];
    typename strategy::result_type *result_buffer;

    unsigned int output_width = roundup(N, strategy::out_width());

    result_buffer = reinterpret_cast<typename strategy::result_type *>(alloca(output_width * strategy::out_height() * sizeof(typename strategy::result_type)));

    {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
        // Perform the GEMM into the intermediate buffer.
        strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, IndirectOutputArg<typename strategy::result_type>(result_buffer, output_width),
                     nullptr, Activation(), false);
    }

    if (os.b_offset != 0) {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (unsigned long)M * kern_k);
#endif
        row_sums_indirect(num_strings, string_ptr, A_arg, M, row_sums, &os);
    } else {
        memset(row_sums, 0, sizeof(int32_t) * strategy::out_height());
    }

    {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_QUANTIZE, (unsigned long)M * N);
#endif
        // Quantize the intermediate block into the final output.
        requantize_block_32(os, N, M, result_buffer, output_width, output_arg.direct.base, output_arg.direct.stride, row_sums, col_bias + n_0, n_0);
    }
}
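// Implementation of the GemmCommon abstract class for "hybrid indirect" GEMMs: B is pretransposed into
// blocks, while A is read in place - either directly, via caller-supplied arrays of row pointers
// ("indirect" mode), or via a convolver which generates those pointers for convolution problems.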
template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
class GemmHybridIndirect : public GemmCommon<To, Tr> {
    typedef typename strategy::lhs_operand_type Tloi;
    typedef typename strategy::rhs_operand_type Troi;
    typedef typename strategy::result_type Tri;

    GemmArgs    _args;
    OutputStage _os = {};

    /* Quantized support (in addition to the output stage above). */
    int32_t *_col_bias = nullptr;

    const unsigned int _Ktotal;
    const unsigned int _rounded_Ksize;

    /* Blocking info. */
    const unsigned int _k_block;
    const unsigned int _n_block;
    const unsigned int _Mround;

    /* Pretransposed buffer. */
    const Troi *_B_transposed = nullptr;

    /* Indirect parameters: _indirect_buf doubles as a flag that the indirect transform should be used. */
    const To * const * const * _indirect_buf = nullptr;

    /* Convolver: only set up for convolution problems, so also doubles as a flag. */
    std::unique_ptr<convolver<To>> _convolver = nullptr;

    const NDRange<4> _window_range;

    unsigned int get_col_sum_size() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            return _args._Nsize * _args._nmulti * sizeof(int32_t);
        } else {
            return 0;
        }
    }
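    // Work out the K blocking.  Kernels which can't accumulate (and quantized kernels, where the output
    // stage needs all of K in one pass) get no K blocking at all; otherwise K is split into roughly
    // equal blocks around a 2KB-of-operand-data target, rounded to the kernel's K unroll.
    //
    // As an illustrative (hypothetical) example: with 1-byte operands, target_block_size is 2048, so
    // ktotal = 5000 exceeds the 3072 threshold; target_blocks = iceildiv(5000, 2048) = 3, block_size =
    // iceildiv(5000, 3) = 1667, which is then rounded up to a multiple of k_unroll() (1668 if
    // k_unroll() is 4).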
    static unsigned int compute_k_block(const GemmArgs &args) {
        // Some kernels don't support accumulate mode; these can't do K blocking at all.
        if (!strategy::supports_accumulate() || std::is_same<OutputStage, Requantize32>::value) {
            return get_ktotal(args);
        }

        unsigned int target_block_size = 2048 / sizeof(To);
        auto ktotal = get_ktotal(args);

        if (ktotal > ((target_block_size*3)/2)) {
            unsigned int target_blocks = iceildiv(ktotal, target_block_size);

            unsigned int block_size = iceildiv(ktotal, target_blocks);

            block_size = roundup(block_size, strategy::k_unroll());

            return block_size;
        }

        return ktotal;
    }

    // Work out the N blocking.  The default is the kernel's native output width; some problems take
    // three times that, and quantized GEMMs with a non-zero B offset choose a larger block (rounded to
    // the output width) so the row sums aren't repeatedly recomputed.
    static unsigned int compute_n_block(const GemmArgs &args, const OutputStage os = {}) {
        if (std::is_same<OutputStage, Requantize32>::value) {
            /* ... */
            return roundup(n_block, strategy::out_width());
        }

        /* ... */
            return strategy::out_width() * 3;

        return strategy::out_width();
    }
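    // Constructors (with and without an output stage).  The parallel window is a 4D range over
    // (blocks of out_height() rows) x batches x (blocks of _n_block columns) x multis.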
public:
    GemmHybridIndirect(const GemmArgs &args, const OutputStage &os)
              : _args(args), _os(os), _Ktotal(get_ktotal(args)),
                _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
                _k_block(compute_k_block(args)), _n_block(compute_n_block(args, os)),
                _Mround(roundup(args._Msize, strategy::out_height())),
                _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
                              iceildiv(args._Nsize, _n_block), args._nmulti) {
        // A copy of the arguments is kept, but there is no lifetime requirement on the GemmConfig,
        // so clear out the pointer to avoid accidents.
        _args._cfg = nullptr;
    }

    GemmHybridIndirect(const GemmArgs &args)
              : _args(args), _Ktotal(get_ktotal(args)),
                _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
                _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
                _Mround(roundup(args._Msize, strategy::out_height())),
                _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
                              iceildiv(args._Nsize, _n_block), args._nmulti) {
        _args._cfg = nullptr;
    }
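    // Interface implementation.  execute() runs one slice of the parallel window; each window coordinate
    // selects a block of output rows, a batch, a block of output columns and a multi index.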
    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        strategy strat(_args._ci);

        std::vector<const To *> in_row_ptrs;
        std::vector<const To * const *> in_row_strings;
        std::vector<unsigned int> string_lengths;

        // In convolution mode, the convolver fills per-section tables of row pointers.
        if (_convolver) {
            in_row_ptrs = std::vector<const To *>(strategy::out_height() * _args._Ksections, nullptr);
            in_row_strings = std::vector<const To * const *>(_args._Ksections, nullptr);

            for (unsigned int i=0; i<_args._Ksections; i++) {
                in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
            }
        }

        // In any indirect mode, the per-section string lengths are needed.
        if (_args._indirect_input) {
            string_lengths = std::vector<unsigned int>(_args._Ksections, 0);
        }

        /* Make sure we've been set up correctly. */
        assert(_B_transposed);
        static_assert(std::is_same<To, Tloi>::value, "gemm_native: Operand types must be the same.");
        for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
            unsigned int kmax   = std::min(k0 + _k_block, _Ktotal);
            unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());

            const bool first_pass = (k0 == 0);
            const bool last_pass  = (kmax == _Ktotal);

            unsigned int first_section = (k0 / _rounded_Ksize);
            unsigned int first_offset  = (k0 % _rounded_Ksize);
            unsigned int kleft    = kern_k;
            unsigned int sections = 0;
            unsigned int offset   = first_offset;

            if (_args._indirect_input) {
                while (kleft) {
                    // 'string_lengths' holds the amount to be processed (excluding padding), but the
                    // amount subtracted from 'kleft' accounts for any padding applied.
                    string_lengths[sections] = std::min(kleft, _args._Ksize - offset);
                    kleft -= std::min(kleft, _rounded_Ksize - offset);
                    sections++;
                    offset = 0;
                }
            }

            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));

            if (p.done()) {
                return;
            }
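            // Rows are processed either "all at once" (as far as the window allows) or one out_height()
            // block at a time; separate quantization and convolution both need the one-block-at-a-time route.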
            const bool process_all_rows = (!SeparateQuantize && !_convolver);

            do {
                const unsigned int m_start = p.dim(0) * strategy::out_height();
                const unsigned int m_end   = process_all_rows ? std::min(p.dim0_max() * strategy::out_height(), _args._Msize)
                                                              : std::min(m_start + strategy::out_height(), _args._Msize);
                const unsigned int batch   = p.dim(1);
                const unsigned int n0      = p.dim(2) * _n_block;
                const unsigned int nmax    = std::min(n0 + _n_block, _args._Nsize);
                const unsigned int multi   = p.dim(3);

                const Troi *b_panel = _B_transposed +
                                      (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
                                      (k0 * roundup(_args._Nsize, strategy::out_width())) +
                                      (n0 * kern_k);

                IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);

#ifdef CYCLE_PROFILING
                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
                // Indirect input: A is addressed via the caller-supplied arrays of row pointers.
                if (_indirect_buf) {
                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, sections, string_lengths.data(),
                        IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(), !first_pass,
                        // Quantization parameters
                        _os, _col_bias+(multi * _args._Nsize), n0);
                }
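                // Convolution case: the convolver produces the row pointer blocks covering this K range;
                // the assertions cross-check its output against the string lengths computed above.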
                else if (_convolver) {
                    auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize);

                    unsigned int pos=0;
                    auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start);

                    while (!conv_rows.finished()) {
                        unsigned int width, conv_offset;

                        assert(pos < sections);

                        std::tie(width, conv_offset) = conv_rows.next_block(&(in_row_ptrs[pos * strategy::out_height()]));

                        if (pos==0) {
                            assert(conv_offset == first_offset);
                        }
                        assert(width == string_lengths[pos]);
                        pos++;
                    }
                    assert(pos == sections);

                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, sections, string_lengths.data(),
                        IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(), !first_pass,
                        // Quantization parameters
                        _os, _col_bias+(multi * _args._Nsize), n0);
                }
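                // Plain (non-indirect) case: A is read directly as a single string whose length excludes
                // any padding ('kmax' potentially includes it).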
                else {
                    const unsigned int len = (std::min(_args._Ksize, kmax) - k0);

                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        strat, 1, &len,
                        IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
                        (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                        (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                        last_pass ? _args._act : Activation(), !first_pass,
                        // Quantization parameters
                        _os, _col_bias+(multi * _args._Nsize), n0);
                }
            } while (process_all_rows ? p.next_dim1() : p.next_dim0());
        }
    }
    // Interface implementation - pretransposed B support.
    bool B_pretranspose_required() const override {
        return (_B_transposed==nullptr);
    }

    size_t get_B_pretransposed_array_size() const override {
        // Start with the actual pretransposed buffer...
        size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Troi);

        // ...and add space for the column sums when quantizing.
        if (std::is_same<OutputStage, Requantize32>::value) {
            size += get_col_sum_size();
        }

        return size;
    }
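    // requantize_bias() computes the per-column sums of B into the start of the working buffer (quantized
    // case only); pretranspose_B_array() then lays the transformed blocks of B out after those sums,
    // inserting padding at the end of each K section so every block has the same fixed structure.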
    void requantize_bias(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            _col_bias = reinterpret_cast<int32_t *>(in_buffer);

            Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);

            for (unsigned int i=0; i<_args._nmulti; i++) {
                // The input has no padding between sections, so Ksize * Ksections gives the total depth.
                compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0);
            }
        }
    }
    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        requantize_bias(in_buffer, B, ldb, B_multi_stride);

        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() is 0.
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        Troi *buffer = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
        _B_transposed = buffer;

        strategy strat(_args._ci);

        for (unsigned int multi=0; multi<_args._nmulti; multi++) {
            for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);

                /* Figure out the size of each block. */
                unsigned int k_size = kmax - k0;

                if (_args._Ksections > 1) {
                    // Multiple K sections: padding is inserted at the end of each section, so each
                    // section size is rounded up to the kernel's K unroll.
                    const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());

                    for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
                        unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);

                        // Track where we are and how much work is left.
                        unsigned int kpos  = k0;
                        unsigned int kleft = k_size;

                        while (kleft) {
                            // Which section are we in, and how far into it?  Based on the rounded-up section size.
                            unsigned int k_section_base = kpos / rounded_section_size;
                            unsigned int k_offset = kpos - (k_section_base * rounded_section_size);

                            // Copy the rest of this section, or up to the end of the requested length.
                            unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);

                            strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
                                                      x0, xmax,
                                                      (k_section_base * _args._Ksize) + k_offset,              // K starting point - based on the section and the true section length.
                                                      (k_section_base * _args._Ksize) + k_offset + k_length);  // K end point - starting point plus the length computed above.

                            // Advance based on the ROUNDED version of what was just done.
                            unsigned int padded_length = roundup(k_length, strategy::k_unroll());

                            buffer += strategy::out_width() * padded_length;

                            kpos  += padded_length;
                            kleft -= padded_length;
                        }
                    }
                } else {
                    // Single K section: process the whole block in one go.
                    strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
                                              0, _args._Nsize, k0, std::min(kmax, _args._Ksize));
                    buffer += roundup(_args._Nsize, strategy::out_width()) * roundup(kmax-k0, strategy::k_unroll());
                }
            }
        }
    }
    void set_pretransposed_B_data(void *in_buffer) override {
        // The transposed data lives after the column sums - in non-quantized cases get_col_sum_size() is 0.
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        _B_transposed = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
        _col_bias = reinterpret_cast<int32_t *>(in_buffer);
    }
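    // Estimate the cycle count for a problem from the strategy's published performance parameters: the
    // MAC count gives the base cost, and the separate-quantize case adds row-sum and requantize passes
    // costed by bytes processed per cycle.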
    template <typename perf_type>
    static uint64_t estimate_cycles(const GemmArgs &args, const OutputStage &os = {}) {
        const PerformanceParameters params = strategy::template get_performance_parameters<perf_type>(args._ci);

        uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);

        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;

        // Current hybrid kernels incur extra overhead when the width is not a multiple of the kernel
        // width, most noticeable at low overall widths, so penalize those cases.
        if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
            mac_cycles *= 1.15f;
        }

        uint64_t total_cycles = mac_cycles;

        // Quantizing kernels with a separate quantize pass need to add in the extra stages.
        if (std::is_same<OutputStage, Requantize32>::value && SeparateQuantize) {
            // Row sums need to visit each value of A (batch * multi * M * K).
            uint64_t rowsum_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * get_ktotal(args);

            /* ... */

            float requantize_cycles = static_cast<float>(requantize_bytes) / params.merge_bytes_cycle;

            total_cycles = mac_cycles + rowsum_cycles + requantize_cycles;
        }

        return total_cycles;
    }
    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            /* ... */
        }
    }

    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
        assert(string_len == _args._Ksize);
        _indirect_buf = ptr;
    }

    void set_convolution_parameters(ConvolutionParameters parms) override {
        _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
    }

    GemmConfig get_config() override {
        GemmConfig c;
        /* ... */
        c.filter = get_type_name<strategy>();
        return c;
    }
};
#ifdef __I_DEFINED_UNUSED
#undef UNUSED
#endif