#include <gemm_hybrid.hpp>

Collaboration diagram for GemmHybrid< strategy, To, Tr >:

Public Member Functions
	GemmHybrid (GemmHybrid &)=delete

GemmHybrid &	operator= (GemmHybrid &)=delete

	GemmHybrid (const GemmArgs &args)

ndrange_t	get_window_size () const override

bool	supports_dynamic_scheduling () const override

void	execute (const ndcoord_t &work_range, const ndcoord_t &, int) override
	Main execute member function. More...

bool	B_is_pretransposed () const override

bool	B_pretranspose_required () const override

size_t	get_B_pretransposed_array_size () const override

void	pretranspose_B_array (void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override

void	set_pretransposed_B_data (void *in_buffer) override

GemmConfig	get_config () override

Public Member Functions inherited from GemmCommon< To, Tr >
virtual void	set_arrays (const To *A, const int lda, const int A_batch_stride, const int A_multi_stride, const To *B, const int ldb, const int B_multi_stride, Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride, const Tr *bias, const int bias_multi_stride)

void	set_arrays_generic (const void *A, const int lda, const int A_batch_stride, const int A_multi_stride, const void *B, const int ldb, const int B_multi_stride, void *C, const int ldc, const int C_batch_stride, const int C_multi_stride, const void *bias, const int bias_multi_stride) override

virtual void	requantize_bias (void *, const To *, const int, const int)

void	pretranspose_B_array_generic (void *out, const void *in, const int row_stride, const int multi_stride) override

virtual void	pretranspose_B_array_part (void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)

void	pretranspose_B_array_part_generic (void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override

virtual void	set_indirect_parameters (size_t, const To *const *const *)

void	set_indirect_parameters_generic (size_t sz, const void *const *const *ptr) override

Public Member Functions inherited from IGemmCommon
virtual void	set_nthreads (int)

virtual size_t	get_working_size () const

virtual void	set_working_space (void *)

virtual size_t	get_B_pretranspose_window_size () const

virtual void	set_quantized_bias (const int32_t *, size_t)

virtual void	set_convolution_parameters (ConvolutionParameters)

virtual	~IGemmCommon ()

Static Public Member Functions
static uint64_t	estimate_cycles (const GemmArgs &args, const PerformanceParameters &params)

Detailed Description

template<typename strategy, typename To, typename Tr>
class arm_gemm::GemmHybrid< strategy, To, Tr >

Definition at line 44 of file gemm_hybrid.hpp.

Constructor & Destructor Documentation

◆ GemmHybrid() [1/2]

GemmHybrid ( GemmHybrid< strategy, To, Tr > & )

delete

◆ GemmHybrid() [2/2]

GemmHybrid ( const GemmArgs & args )

inline

Definition at line 130 of file gemm_hybrid.hpp.

               : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
                 _nbatches(args._nbatches), _nmulti(args._nmulti),
                 _act(args._act),
                 _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
                 _Mround(roundup(args._Msize, strategy::out_height())),
                 _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { }

Member Function Documentation

◆ B_is_pretransposed()

bool B_is_pretransposed ( ) const

inline override virtual

Reimplemented from IGemmCommon.

Definition at line 212 of file gemm_hybrid.hpp.

                                              {
         return true;
     }

◆ B_pretranspose_required()

bool B_pretranspose_required ( ) const

inline override virtual

Reimplemented from IGemmCommon.

Definition at line 216 of file gemm_hybrid.hpp.

                                                   {
         return (_B_transposed==nullptr);
     }

◆ estimate_cycles()

static uint64_t estimate_cycles	(	const GemmArgs &	args,
		const PerformanceParameters &	params
	)

inline static

Definition at line 253 of file gemm_hybrid.hpp.

                                                                                                {
         // Note: Current hybrid kernels don't actually round up height (they
         // have paths for each possible height).  Might need to make this
         // configurable in future.
         uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
  
         float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
  
         // TODO: A bit of a kludge here: current hybrid kernels incur extra
         // overhead where the width is not a multiple of kernel width.  It's
         // most noticeable where the overall width is quite low, so add 15%
         // penalty for such widths.
         if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
             mac_cycles *= 1.15f;
         }
  
         uint64_t total_cycles = mac_cycles;
  
         return total_cycles;
     }

References GemmTuner::args, PerformanceParameters::kernel_macs_cycle, and arm_gemm::roundup().

◆ execute()

void execute	(	const ndcoord_t &	work_range,
		const ndcoord_t &	thread_locator,
		int	threadid
	)

inline override virtual

Main execute member function.

Parameters

[in]	work_range	specifies the range of work we want to be computed, total range defined by get_window_size()
[in]	thread_locator	where are we inside of the thread space
[in]	threadid	a unique threadid

Implements IGemmCommon.

Definition at line 149 of file gemm_hybrid.hpp.

                                                                                {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
         strategy strat(_ci);
  
         /* Make sure we've been set up correctly. */
         assert(_B_transposed);
         static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
         static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
  
         /* For now, each work item implies all the K for a given output
          * pixel (so we don't need to synchronize access to the output
          * array).  So separate the loop over K blocks here.  */
         for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
             unsigned int kmax   = std::min(k0 + _k_block, _Ksize);
             unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());
  
             const bool first_pass = (k0 == 0);
             const bool last_pass = (kmax == _Ksize);
  
             auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
  
             if (p.done()) {
                 return;
             }
  
             do {
                 const unsigned int m_start = p.dim(0) * strategy::out_height();
                 const unsigned int m_end   = std::min(p.dim0_max() * strategy::out_height(), _Msize);
                 const unsigned int batch   = p.dim(1);
                 const unsigned int n0      = p.dim(2) * _n_block;
                 const unsigned int nmax    = std::min(n0 + _n_block, _Nsize);
                 const unsigned int multi   = p.dim(3);
  
                 const Toi *b_panel = _B_transposed +
                                      (multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) +
                                      (k0 * roundup(_Nsize, strategy::out_width())) +
                                      (n0 * kern_k);
  
 #ifdef CYCLE_PROFILING
                 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
 #endif
  
                 strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
                              b_panel,
                              this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
                              (m_end - m_start), (nmax - n0), kmax-k0,
                              (strategy::supports_bias() && first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                              last_pass ? _act : Activation(), !first_pass);
  
                 // Add bias externally if needed
                 if (!strategy::supports_bias() && this->_bias && first_pass) {
                     bias_adder(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
                                this->_bias + (multi * this->_bias_multi_stride) + n0,
                                (m_end - m_start), (nmax - n0));
                 }
  
             } while (p.next_dim1());
         }
     }

References arm_gemm::bias_adder(), NDCoordinate< N >::get_position(), NDCoordinate< N >::get_position_end(), NDRange< D >::iterator(), arm_gemm::roundup(), and strategy.

◆ get_B_pretransposed_array_size()

size_t get_B_pretransposed_array_size ( ) const

inline override virtual

Reimplemented from IGemmCommon.

Definition at line 220 of file gemm_hybrid.hpp.

                                                            {
         return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
     }

References arm_gemm::roundup().

◆ get_config()

GemmConfig get_config ( )

inline override virtual

Implements IGemmCommon.

Definition at line 274 of file gemm_hybrid.hpp.

                                      {
         GemmConfig c;
  
         c.method = GemmMethod::GEMM_HYBRID;
         c.inner_block_size = _k_block;
         c.outer_block_size = _n_block;
         c.filter = get_type_name<strategy>();
  
         return c;
     }

References GemmConfig::filter, arm_gemm::GEMM_HYBRID, GemmConfig::inner_block_size, GemmConfig::method, and GemmConfig::outer_block_size.

◆ get_window_size()

ndrange_t get_window_size ( ) const

inline override virtual

Returns: an ndrange containing ranges of the compute space which can be broken up and parallelised over

Implements IGemmCommon.

Definition at line 139 of file gemm_hybrid.hpp.

                                                {
         return { _window_range.total_size() };
     }

References NDRange< D >::total_size().

◆ operator=()

GemmHybrid& operator= ( GemmHybrid< strategy, To, Tr > & )

delete

◆ pretranspose_B_array()

void pretranspose_B_array	(	void *	in_buffer,
		const To *	B,
		const int	ldb,
		const int	B_multi_stride
	)

inline override virtual

Reimplemented from GemmCommon< To, Tr >.

Definition at line 224 of file gemm_hybrid.hpp.

                                                                                                               {
         Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
         _B_transposed = buffer;
         strategy strat(_ci);
  
         for (unsigned int multi=0; multi<_nmulti; multi++) {
             for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
                 const unsigned int kmax = std::min(k0 + _k_block, _Ksize);
                 const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll());
  
                 for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) {
                     const unsigned int xmax = std::min(x0+_n_block, _Nsize);
  
                     const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
  
                     strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
                                                x0, xmax, k0, kmax);
  
                     buffer += size;
                 }
             }
         }
     }

References arm_gemm::roundup(), and strategy.

◆ set_pretransposed_B_data()

void set_pretransposed_B_data ( void * in_buffer )

inline override virtual

Reimplemented from IGemmCommon.

Definition at line 248 of file gemm_hybrid.hpp.

                                                             {
         _B_transposed = reinterpret_cast<Toi *>(in_buffer);
     }

◆ supports_dynamic_scheduling()

bool supports_dynamic_scheduling ( ) const

inline override virtual

Reimplemented from IGemmCommon.

Definition at line 144 of file gemm_hybrid.hpp.

                                                       {
         return true;
     }

The documentation for this class was generated from the following file:

src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp

Public Member Functions

Static Public Member Functions

Detailed Description

template<typename strategy, typename To, typename Tr> class arm_gemm::GemmHybrid< strategy, To, Tr >

Constructor & Destructor Documentation

◆ GemmHybrid() [1/2]

◆ GemmHybrid() [2/2]

Member Function Documentation

◆ B_is_pretransposed()

◆ B_pretranspose_required()

◆ estimate_cycles()

◆ execute()

◆ get_B_pretransposed_array_size()

◆ get_config()

◆ get_window_size()

◆ operator=()

◆ pretranspose_B_array()

◆ set_pretransposed_B_data()

◆ supports_dynamic_scheduling()

template<typename strategy, typename To, typename Tr>
class arm_gemm::GemmHybrid< strategy, To, Tr >