Compute Library
 22.11
GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize, FixedFormat > Class Template Reference

#include <gemm_hybrid_indirect.hpp>

Collaboration diagram for GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize, FixedFormat >:

Public Member Functions

 GemmHybridIndirect (GemmHybridIndirect &)=delete
 
GemmHybridIndirect & operator= (GemmHybridIndirect &)=delete
 
 GemmHybridIndirect (const GemmArgs &args, const OutputStage &os)
 
 GemmHybridIndirect (const GemmArgs &args)
 
ndrange_t get_window_size () const override
 
bool supports_dynamic_scheduling () const override
 
void execute (const ndcoord_t &work_range, const ndcoord_t &, int) override
 Main execute member function.
 
bool B_is_pretransposed () const override
 
bool B_pretranspose_required () const override
 
size_t get_B_pretransposed_array_size () const override
 
void requantize_bias (void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override
 
void pretranspose_B_array (void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override
 
void set_pretransposed_B_data (void *in_buffer) override
 
void set_quantized_bias (const int32_t *bias, size_t bias_multi_stride) override
 
void set_indirect_parameters (size_t string_len, const To *const *const *ptr) override
 
void set_convolution_parameters (ConvolutionParameters parms) override
 
GemmConfig get_config () override
 
- Public Member Functions inherited from GemmCommon< To, Tr >
virtual void set_arrays (const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, const To *B, const int ldb, const int B_multi_stride, Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, const Tr *bias, const int bias_multi_stride)
 
void set_arrays_generic (const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, const void *B, const int ldb, const int B_multi_stride, void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, const void *bias, const int bias_multi_stride) override
 
void pretranspose_B_array_generic (void *out, const void *in, const int row_stride, const int multi_stride) override
 
void set_indirect_parameters_generic (size_t sz, const void *const *const *ptr) override
 
- Public Member Functions inherited from IGemmCommon
virtual void set_nthreads (int)
 
virtual size_t get_working_size () const
 
virtual void set_working_space (void *)
 
virtual ~IGemmCommon ()
 

Static Public Member Functions

template<typename perf_type >
static uint64_t estimate_cycles (const GemmArgs &args, const OutputStage &os={})
 

Detailed Description

template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false, bool FixedFormat = false>
class arm_gemm::GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize, FixedFormat >

Definition at line 264 of file gemm_hybrid_indirect.hpp.
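No detailed description is provided in the source, so the following is a minimal usage sketch assembled from the member functions documented on this page. The placeholder StrategyType, the float element types and the helper signature are assumptions for illustration; in practice instances are typically obtained from a populated GemmArgs via the arm_gemm::gemm() factory rather than constructed by hand.

#include <cstdint>
#include <vector>

#include <gemm_hybrid_indirect.hpp>

// 'StrategyType' stands in for a real hybrid kernel strategy class, and the operand
// pointers and strides are assumed to be set up by the caller.
void prepare_gemm(arm_gemm::GemmHybridIndirect<StrategyType, float, float> &gemm,
                  std::vector<uint8_t> &b_buffer,
                  const float *A, int lda, int A_batch_stride, int A_multi_stride,
                  const float *B, int ldb, int B_multi_stride,
                  float *C, int ldc, int C_batch_stride, int C_multi_stride,
                  const float *bias, int bias_multi_stride)
{
    // One-off preparation of the B (weights) operand. Real callers allocate this
    // buffer with suitable alignment and keep it alive while the object is in use.
    if (gemm.B_pretranspose_required()) {
        b_buffer.resize(gemm.get_B_pretransposed_array_size());
        gemm.pretranspose_B_array(b_buffer.data(), B, ldb, B_multi_stride);
    }

    // Wire up operand and result pointers. The window returned by get_window_size()
    // is then split and passed to execute(); see the sketch under execute() below.
    gemm.set_arrays(A, lda, A_batch_stride, A_multi_stride,
                    B, ldb, B_multi_stride,
                    C, ldc, C_batch_stride, C_multi_stride,
                    bias, bias_multi_stride);
}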

Constructor & Destructor Documentation

◆ GemmHybridIndirect() [1/3]

GemmHybridIndirect ( GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize, FixedFormat > &  )
delete

◆ GemmHybridIndirect() [2/3]

GemmHybridIndirect ( const GemmArgs &  args,
const OutputStage &  os 
)
inline

Definition at line 389 of file gemm_hybrid_indirect.hpp.

References GemmArgs::_cfg.

390  : _args(args), _os(os), _Ktotal(get_ktotal(args)),
391  _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
392  _k_block(compute_k_block(args)), _n_block(compute_n_block(args, os)),
393  _Mround(roundup(args._Msize, strategy::out_height())),
394  _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
395  iceildiv(args._Nsize, _n_block), args._nmulti)
396  {
397  // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the
398  // GemmConfig. Clear out the pointer to avoid accidents.
399  _args._cfg = nullptr;
400  }

◆ GemmHybridIndirect() [3/3]

GemmHybridIndirect ( const GemmArgs &  args)
inline

Definition at line 403 of file gemm_hybrid_indirect.hpp.

References GemmArgs::_cfg.

404  : _args(args), _Ktotal(get_ktotal(args)),
405  _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
406  _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
407  _Mround(roundup(args._Msize, strategy::out_height())),
408  _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
409  iceildiv(args._Nsize, _n_block), args._nmulti)
410  {
411  // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the
412  // GemmConfig. Clear out the pointer to avoid accidents.
413  _args._cfg = nullptr;
414  }

Member Function Documentation

◆ B_is_pretransposed()

bool B_is_pretransposed ( ) const
inline override virtual

Reimplemented from IGemmCommon.

Definition at line 591 of file gemm_hybrid_indirect.hpp.

591  {
592  return (FixedFormat == false);
593  }

◆ B_pretranspose_required()

bool B_pretranspose_required ( ) const
inline override virtual

Reimplemented from IGemmCommon.

Definition at line 595 of file gemm_hybrid_indirect.hpp.

595  {
596  return (FixedFormat == false) && (_B_transposed==nullptr);
597  }

◆ estimate_cycles()

static uint64_t estimate_cycles ( const GemmArgs &  args,
const OutputStage &  os = {} 
)
inline static

Definition at line 711 of file gemm_hybrid_indirect.hpp.

References GemmArgs::_ci, GemmArgs::_Msize, GemmArgs::_nbatches, GemmArgs::_nmulti, GemmArgs::_Nsize, Requantize32::b_offset, PerformanceParameters::kernel_macs_cycle, PerformanceParameters::merge_bytes_cycle, PerformanceParameters::prepare_bytes_cycle, and arm_gemm::roundup().

711  {}) {
712  const PerformanceParameters params = strategy::template get_performance_parameters<perf_type>(args._ci);
713 
714  // Note: Current hybrid kernels don't actually round up height (they
715  // have paths for each possible height). Might need to make this
716  // configurable in future.
717  uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * get_ktotal(args);
718 
719  float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
720 
721  // TODO: A bit of a kludge here: current hybrid kernels incur extra
722  // overhead where the width is not a multiple of kernel width. It's
 723  // most noticeable where the overall width is quite low, so add 15%
724  // penalty for such widths.
725  if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
726  mac_cycles *= 1.15f;
727  }
728 
729  uint64_t total_cycles = mac_cycles;
730 
731  // Quantizing kernels with separate quantize need to add in the extra stages.
732  if (std::is_same<OutputStage, Requantize32>::value && SeparateQuantize) {
733  const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&os);
734 
735  // Row sums: need to consider each value in A (batch * multi * M * K)...
736  uint64_t rowsum_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * get_ktotal(args);
737 
738  // ... but row sums are skipped if B offset==0.
739  if (qp->b_offset == 0) {
740  rowsum_bytes = 0;
741  }
742 
743  // Use "prepare bytes per cycle" to store "row sum values per cycle".
744  float rowsum_cycles = static_cast<float>(rowsum_bytes) / params.prepare_bytes_cycle;
745 
746  // Requantize: need to consider each value in C (batch * multi * M * N)
747  uint64_t requantize_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * args._Nsize;
748 
749  // Use "merge bytes per cycle" to store "requantize values per cycle".
750  float requantize_cycles = static_cast<float>(requantize_bytes) / params.merge_bytes_cycle;
751 
752  // Recalculate total_cycles with the extra components.
753  total_cycles = mac_cycles + rowsum_cycles + requantize_cycles;
754  }
755 
756  return total_cycles;
757  }
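To make the cost model above concrete, here is an illustrative back-of-the-envelope calculation; all of the numbers are invented for the example and do not correspond to any real CPU or kernel.

// Illustrative numbers only (not taken from any real strategy):
//   _Msize = 64, _Nsize = 96, Ktotal = 256, _nbatches = _nmulti = 1,
//   out_width() = 16, kernel_macs_cycle = 32.
//
//   total_macs = 1 * 1 * 64 * roundup(96, 16) * 256 = 1,572,864
//   mac_cycles = 1,572,864 / 32 = 49,152
//
// 96 is at least 2 * out_width(), so the 15% narrow-width penalty does not apply and
// the estimate is roughly 49k cycles. With SeparateQuantize, the requantize pass (and,
// when b_offset != 0, the row-sum pass) is added on top using merge_bytes_cycle and
// prepare_bytes_cycle respectively.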

◆ execute()

void execute ( const ndcoord_t &  work_range,
const ndcoord_t &  thread_locator,
int  threadid 
)
inline override virtual

Main execute member function.

Parameters
    [in] work_range      specifies the range of work to be computed; the total range is defined by get_window_size()
    [in] thread_locator  where we are inside the thread space
    [in] threadid        a unique thread id

Implements IGemmCommon.

Definition at line 427 of file gemm_hybrid_indirect.hpp.

References GemmArgs::_act, GemmArgs::_ci, GemmArgs::_indirect_input, GemmArgs::_Ksections, GemmArgs::_Ksize, GemmArgs::_Msize, GemmArgs::_nbatches, GemmArgs::_Nsize, NDCoordinate< N >::get_position(), NDCoordinate< N >::get_position_end(), NDRange< D >::iterator(), and arm_gemm::roundup().

427  {
428 #ifdef CYCLE_PROFILING
429  profiler prof;
430 #endif
431  strategy strat(_args._ci);
432 
433  std::vector<const To *> in_row_ptrs;
434  std::vector<const To * const *> in_row_strings;
435  std::vector<unsigned int> string_lengths;
436 
437  // In convolution mode, we need input pointers.
438  if (_convolver) {
439  in_row_ptrs = std::vector<const To *>(strategy::out_height() * _args._Ksections, nullptr);
440  in_row_strings = std::vector<const To * const *>(_args._Ksections, nullptr);
441 
442  for (unsigned int i=0; i<_args._Ksections; i++) {
443  in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
444  }
445  }
446 
447  // In any indirect mode, we need the string lengths.
448  if (_args._indirect_input) {
449  string_lengths = std::vector<unsigned int>(_args._Ksections, 0);
450  }
451 
452  /* Make sure we've been set up correctly. */
453  assert(FixedFormat || _B_transposed);
454  static_assert(std::is_same<To, Tloi>::value, "gemm_native: Operand types must be the same.");
455 // static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
456 
457  /* For now, each work item implies all the K for a given output
458  * pixel (so we don't need to synchronize access to the output
459  * array). So separate the loop over K blocks here. */
460  for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
461  unsigned int kmax = std::min(k0 + _k_block, _Ktotal);
462  unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());
463 
464  const bool first_pass = (k0 == 0);
465  const bool last_pass = (kmax == _Ktotal);
466 
467  unsigned int first_section = (k0 / _rounded_Ksize);
468  unsigned int first_offset = (k0 % _rounded_Ksize);
469  unsigned int kleft = kern_k;
470  unsigned int sections=0;
471  unsigned int offset = first_offset;
472 
473  if (_args._indirect_input) {
474  while (kleft) {
475  // When chopping into sections: the amount that goes into 'string_lengths' is the amount to be
476  // processed (excluding padding). But the amount we subtract from 'kleft' takes account of any
477  // padding applied.
478  string_lengths[sections] = std::min(kleft, _args._Ksize - offset);
479  kleft -= std::min(kleft, _rounded_Ksize - offset);
480  sections++;
481  offset=0;
482  }
483  }
484 
485  auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
486 
487  if (p.done()) {
488  return;
489  }
490 
491  // Process rows either 'out_height' rows at a time, or do all valid rows at once with a single kernel call.
492  // The separate quantizer path only handles one block of rows at a time (as it has to store sums and intermediate results).
 493  // The convolution path only generates the pointers for one block of rows at a time.
494  const bool process_all_rows = (!SeparateQuantize && !_convolver);
495 
496  do {
497  const unsigned int m_start = p.dim(0) * strategy::out_height();
498  const unsigned int m_end = process_all_rows ? std::min(p.dim0_max() * strategy::out_height(), _args._Msize) : std::min(m_start + strategy::out_height(), _args._Msize);
499 // const unsigned int m_end = std::min(m_start + strategy::out_height(), _args._Msize);
500  const unsigned int batch = p.dim(1);
501  const unsigned int n0 = p.dim(2) * _n_block;
502  const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize);
503  const unsigned int multi = p.dim(3);
504 
505  const Troi *b_panel;
506  if (FixedFormat) {
507  b_panel = reinterpret_cast<const Troi *>(this->_Bptr) +
508  (multi * this->_B_multi_stride) +
509  ((n0 / stripe_width<strategy, FixedFormat>::get()) * this->_ldb) +
510  (k0 * stripe_width<strategy, FixedFormat>::get());
511  } else {
512  b_panel = _B_transposed +
513  (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
514  (k0 * roundup(_args._Nsize, strategy::out_width())) +
515  (n0 * kern_k);
516  }
517 
518  IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
519 
520 #ifdef CYCLE_PROFILING
521  auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
522 #endif
523  if (_indirect_buf) {
525 #ifdef CYCLE_PROFILING
526  prof,
527 #endif
528  strat, sections, string_lengths.data(),
529  IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
530  (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
531  (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
532  last_pass ? _args._act : Activation(),
533  !first_pass,
534  // Quantization parameters
535  _os, _col_bias+(multi * _args._Nsize), n0);
536  } else if (_convolver) {
537  auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize);
538 
539  unsigned int pos=0;
540  auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start);
541 
542  while (!conv_rows.finished()) {
543  unsigned int width, conv_offset;
544 
545  assert(pos < sections);
546 
547  std::tie(width, conv_offset) = conv_rows.next_block(&(in_row_ptrs[pos * strategy::out_height()]));
548 
549  if (pos==0) {
550  assert(conv_offset == first_offset);
551  }
552  assert(width == string_lengths[pos]);
553  pos++;
554  }
555  assert(pos == sections);
556 
558 #ifdef CYCLE_PROFILING
559  prof,
560 #endif
561  strat, sections, string_lengths.data(),
562  IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
563  (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
564  (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
565  last_pass ? _args._act : Activation(),
566  !first_pass,
567  // Quantization parameters
568  _os, _col_bias+(multi * _args._Nsize), n0);
569  } else {
570  // Length to process. This needs to exclude padding, but 'kmax' potentially includes it.
571  const unsigned int len = (std::min(_args._Ksize, kmax) - k0);
572 
574 #ifdef CYCLE_PROFILING
575  prof,
576 #endif
577  strat, 1, &len,
578  IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
579  (m_end - m_start), (nmax - n0), kern_k, b_panel, this->_ldb, out_arg,
580  (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
581  last_pass ? _args._act : Activation(),
582  !first_pass,
583  // Quantization parameters
584  _os, _col_bias+(multi * _args._Nsize), n0);
585  }
586  } while (process_all_rows ? p.next_dim1() : p.next_dim0());
587  }
588  }
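The window returned by get_window_size() is one-dimensional here, and execute() only reads dimension 0 of work_range, so callers are free to divide it between threads however they like. A minimal static split is sketched below; the {start, size} initializer form of ndcoord_t, the placeholder StrategyType and the sequential loop standing in for real worker threads are assumptions for illustration.

// Sketch only: 'gemm' is assumed to be prepared as in the earlier example; in a real
// caller each loop iteration would be dispatched to its own worker thread.
void run_gemm(arm_gemm::GemmHybridIndirect<StrategyType, float, float> &gemm, unsigned int nthreads)
{
    const unsigned int total = gemm.get_window_size().total_size();

    for (unsigned int t = 0; t < nthreads; t++) {
        const unsigned int start = (total * t) / nthreads;
        const unsigned int end   = (total * (t + 1)) / nthreads;
        if (start == end) {
            continue;   // nothing left for this worker
        }

        // Dimension 0 carries the work range as a {start, size} pair; the other
        // dimensions (and the thread_locator argument) are unused by this class.
        arm_gemm::ndcoord_t work_range({ {start, end - start}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1} });
        arm_gemm::ndcoord_t thread_locator({ {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1}, {0, 1} });

        gemm.execute(work_range, thread_locator, static_cast<int>(t));
    }
}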

◆ get_B_pretransposed_array_size()

size_t get_B_pretransposed_array_size ( ) const
inline override virtual

Reimplemented from IGemmCommon.

Definition at line 599 of file gemm_hybrid_indirect.hpp.

References GemmArgs::_Msize, GemmArgs::_nbatches, GemmArgs::_nmulti, GemmArgs::_Nsize, and arm_gemm::roundup().

599  {
600  if (FixedFormat) {
601  return 0;
602  }
603 
604  // Start with actual pretransposed buffer...
605  size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Troi);
606 
607  // Space for result row pointers (not strictly needed any more but retained for indirect output testing)
608  size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *);
609 
610  if (std::is_same<OutputStage, Requantize32>::value) {
611  size += get_col_sum_size();
612  }
613 
614  return size;
615  }

◆ get_config()

GemmConfig get_config ( )
inline override virtual

Implements IGemmCommon.

Definition at line 778 of file gemm_hybrid_indirect.hpp.

References GemmConfig::filter, arm_gemm::GEMM_HYBRID, arm_gemm::get_weight_format(), GemmConfig::inner_block_size, GemmConfig::method, GemmConfig::outer_block_size, and GemmConfig::weight_format.

778  {
779  GemmConfig c;
780 
781  c.method = GemmMethod::GEMM_HYBRID;
782  c.inner_block_size = _k_block;
783  c.outer_block_size = _n_block;
784  c.filter = get_type_name<strategy>();
785  c.weight_format = get_weight_format(kernel_weight_format<strategy, FixedFormat>::get(), sizeof(To));
786 
787  return c;
788  }

◆ get_window_size()

ndrange_t get_window_size ( ) const
inline override virtual
Returns
an ndrange containing ranges of the compute space which can be broken up and parallelised over

Implements IGemmCommon.

Definition at line 417 of file gemm_hybrid_indirect.hpp.

References NDRange< D >::total_size().

417  {
418  return { _window_range.total_size() };
419  }

◆ operator=()

GemmHybridIndirect& operator= ( GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize, FixedFormat > &  )
delete

◆ pretranspose_B_array()

void pretranspose_B_array ( void *  in_buffer,
const To *  B,
const int  ldb,
const int  B_multi_stride 
)
inline override virtual

Reimplemented from GemmCommon< To, Tr >.

Definition at line 630 of file gemm_hybrid_indirect.hpp.

References GemmArgs::_ci, GemmArgs::_Ksections, GemmArgs::_Ksize, GemmArgs::_nmulti, GemmArgs::_Nsize, and arm_gemm::roundup().

630  {
631  requantize_bias(in_buffer, B, ldb, B_multi_stride);
632 
633  // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
634  uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
635  Troi *buffer = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
636  _B_transposed = buffer;
637 
638  strategy strat(_args._ci);
639 
640  for (unsigned int multi=0; multi<_args._nmulti; multi++) {
641  for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
642  const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);
643 
644  /* Figure out the size of each block. */
645  unsigned int k_size = kmax - k0;
646 
647  if (_args._Ksections > 1) {
648  // We need to insert padding at the end of each K section.
649  // The computation needed is a little delicate - the coordinates from the block walker are expressed in
650  // terms of the full, padded, _Ktotal.
651  // But we need to transform each section with reference to the original, unpadded, input, letting the
652  // transform pad each section as needed.
653 
654  // This is needed for computations below.
655  const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
656 
657  // The expected output format is also an entire <out_width> columns interleaved, then the next set of
658  // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
659  // a time.
660  for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
661  unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
662 
663  // Track where we are and how much work is left.
664  unsigned int kpos = k0;
665  unsigned int kleft = k_size;
666 
667  while (kleft) {
668  // Which section are we in? Based on the rounded-up section size.
669  unsigned int k_section_base = kpos / rounded_section_size;
670  // How far into the section are we?
671  unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
672 
673  // We will either copy the rest of this section, or to the end of the requested length.
674  unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
675 
676  strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
677  x0, xmax,
678  (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
679  (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
680 
681  // We need to modify our position based on the ROUNDED version of what we just did.
682  unsigned int padded_length = roundup(k_length, strategy::k_unroll());
683 
684  buffer += strategy::out_width() * padded_length;
685 
686  kpos += padded_length;
687  kleft -= padded_length;
688  }
689  }
690  } else {
691  // In the single K section case, can process the whole lot in one go.
692  strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
693  0, _args._Nsize, k0, std::min(kmax, _args._Ksize));
694  buffer += roundup(_args._Nsize, strategy::out_width()) * roundup(kmax-k0, strategy::k_unroll());
695  }
696  }
697  }
698  }
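The K-section padding arithmetic above is easiest to follow with concrete numbers; the figures below are invented for illustration and assume that _Ktotal rounds each of the _Ksections sections up to a multiple of k_unroll().

// Illustrative numbers only: _Ksize = 18, _Ksections = 3, k_unroll() = 4.
//   rounded_section_size = roundup(18, 4) = 20, so each section occupies 20 rows of
//   the padded K dimension (2 of them padding) and, under the assumption above,
//   _Ktotal = 3 * 20 = 60.
//
// For a block position kpos = 25:
//   k_section_base = 25 / 20     = 1    (second section)
//   k_offset       = 25 - 1 * 20 = 5    (five rows into that section)
//
// PrepareB() is therefore pointed at unpadded source row (1 * 18) + 5 = 23, and the
// transform itself appends the padding at the end of the section.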

◆ requantize_bias()

void requantize_bias ( void *  in_buffer,
const To *  B,
const int  ldb,
const int  B_multi_stride 
)
inline override virtual

Reimplemented from GemmCommon< To, Tr >.

Definition at line 617 of file gemm_hybrid_indirect.hpp.

References GemmArgs::_Ksections, GemmArgs::_Ksize, GemmArgs::_nmulti, GemmArgs::_Nsize, and arm_gemm::compute_col_sums().

617  {
618  if (std::is_same<OutputStage, Requantize32>::value) {
619  _col_bias = reinterpret_cast<int32_t *>(in_buffer);
620 
621  Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);
622 
623  for (unsigned int i=0; i<_args._nmulti; i++) {
624  // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size.
625  compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0);
626  }
627  }
628  }

◆ set_convolution_parameters()

void set_convolution_parameters ( ConvolutionParameters  parms)
inline override virtual

Reimplemented from IGemmCommon.

Definition at line 773 of file gemm_hybrid_indirect.hpp.

References GemmArgs::_Ksize, and ConvolutionParameters::input_channels.

773  {
774  assert(parms.input_channels == _args._Ksize);
775  _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
776  }

◆ set_indirect_parameters()

void set_indirect_parameters ( size_t  string_len,
const To *const *const *  ptr 
)
inline override virtual

Reimplemented from GemmCommon< To, Tr >.

Definition at line 768 of file gemm_hybrid_indirect.hpp.

References GemmArgs::_Ksize.

768  {
769  assert(string_len == _args._Ksize);
770  _indirect_buf = ptr;
771  }
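The expected shape of ptr is not spelled out here, but the indexing used in execute() above ((multi * _nbatches * _Ksections) + (batch * _Ksections) + section) implies a flat [multi][batch][section] table whose entries each point to a per-output-row array of input row pointers. The following is a hedged sketch of building such a table; rows_for() is a hypothetical helper, and ownership of the pointed-to row arrays stays with the caller.

#include <vector>

// Hypothetical helper: returns one input row pointer per output row for the given
// (multi, batch, K-section) triple. Invented for illustration.
const float *const *rows_for(unsigned int multi, unsigned int batch, unsigned int section);

void set_indirect(arm_gemm::GemmHybridIndirect<StrategyType, float, float> &gemm,
                  unsigned int nmulti, unsigned int nbatches, unsigned int Ksections,
                  size_t Ksize)
{
    // Flat [multi][batch][section] table, matching the indexing used in execute().
    // Kept static in this sketch because the object stores the pointer it is given.
    static std::vector<const float *const *> table;
    table.resize(nmulti * nbatches * Ksections);

    for (unsigned int m = 0; m < nmulti; m++) {
        for (unsigned int b = 0; b < nbatches; b++) {
            for (unsigned int s = 0; s < Ksections; s++) {
                table[(m * nbatches * Ksections) + (b * Ksections) + s] = rows_for(m, b, s);
            }
        }
    }

    // string_len must match the _Ksize the object was configured with.
    gemm.set_indirect_parameters(Ksize, table.data());
}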

◆ set_pretransposed_B_data()

void set_pretransposed_B_data ( void *  in_buffer)
inline override virtual

Reimplemented from IGemmCommon.

Definition at line 700 of file gemm_hybrid_indirect.hpp.

700  {
701  // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
702  uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
703  _B_transposed = reinterpret_cast<Troi *>(buffer_int + get_col_sum_size());
704  _col_bias = reinterpret_cast<int32_t *>(in_buffer);
705  }
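Since the body above only records the supplied pointer (offset past the column sums), this entry point can hand back a buffer that was filled earlier by pretranspose_B_array() on an identically configured object, avoiding a repeat of the transform. A hypothetical reuse pattern follows; the cache object and its API are invented for this sketch.

// 'weight_cache' and 'layer_id' are invented for this sketch; the library itself does
// not provide them. The buffer must have been produced by pretranspose_B_array() with
// the same GemmArgs/strategy configuration.
void *prepared = weight_cache.lookup(layer_id);
if (prepared != nullptr) {
    gemm.set_pretransposed_B_data(prepared);    // adopt the existing buffer
} else {
    prepared = weight_cache.allocate(layer_id, gemm.get_B_pretransposed_array_size());
    gemm.pretranspose_B_array(prepared, B, ldb, B_multi_stride);
}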

◆ set_quantized_bias()

void set_quantized_bias ( const int32_t *  bias,
size_t  bias_multi_stride 
)
inline override virtual

Reimplemented from IGemmCommon.

Definition at line 759 of file gemm_hybrid_indirect.hpp.

References Requantize32::bias and Requantize32::bias_multi_stride.

759  {
760  if (std::is_same<OutputStage, Requantize32>::value) {
761  Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);
762 
763  qp->bias = bias;
764  qp->bias_multi_stride = bias_multi_stride;
765  }
766  }

◆ supports_dynamic_scheduling()

bool supports_dynamic_scheduling ( ) const
inline override virtual

Reimplemented from IGemmCommon.

Definition at line 422 of file gemm_hybrid_indirect.hpp.

422  {
423  return true;
424  }

The documentation for this class was generated from the following file:
gemm_hybrid_indirect.hpp