#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

template<typename strategy, typename To, typename Tr>
class GemmHybrid : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;

    const CPUInfo * const _ci;

    const unsigned int _Msize;
    const unsigned int _Nsize;
    const unsigned int _Ksize;

    const unsigned int _nbatches;
    const unsigned int _nmulti;

    const Activation _act;

    const unsigned int _k_block;
    const unsigned int _n_block;
    const unsigned int _Mround;

    /* Pretransposed buffer. */
    const Toi *_B_transposed = nullptr;

    const NDRange<4> _window_range;
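    /* Work out the K blocking depth.  Kernels which cannot accumulate into C
     * must see all of K in one pass; otherwise aim for blocks of roughly 2KB
     * of operand data, rounded up to the kernel's K unroll. */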
    static unsigned int compute_k_block(const GemmArgs &args) {
        if (!strategy::supports_accumulate()) {
            return args._Ksize;
        }

        unsigned int target_block_size = 2048 / sizeof(To);

        if (args._Ksize >= ((3 * target_block_size) / 2)) {
            /* Smallest number of blocks that keeps each block under the target size. */
            unsigned int target_blocks = iceildiv(args._Ksize, target_block_size);

            /* Even-sized blocks that cover Ksize with that many blocks. */
            unsigned int block_size = iceildiv(args._Ksize, target_blocks);

            block_size = roundup(block_size, strategy::k_unroll());

            return block_size;
        }

        return args._Ksize;
    }
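    /* Work out the N blocking width.  Respect a user-supplied outer block
     * size (rounded to a multiple of the kernel output width); otherwise go a
     * few kernel widths wide for shallow problems on few threads, or a single
     * kernel width by default. */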
    static unsigned int compute_n_block(const GemmArgs &args) {
        if (args._cfg && args._cfg->outer_block_size) {
            unsigned int n_block = args._cfg->outer_block_size;

            /* Needs to be (at least a single) multiple of the kernel output width. */
            n_block /= strategy::out_width();
            n_block = std::max(n_block, 1u) * strategy::out_width();

            return n_block;
        }

        if ((args._Ksize <= 128) && (args._maxthreads <= 16)) {
            return strategy::out_width() * 3;
        }

        return strategy::out_width();
    }
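    /* Constructor: capture the problem dimensions and work out the blocking
     * parameters and the parallel work window up front. */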
public:
    GemmHybrid(const GemmArgs &args)
        : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
          _nbatches(args._nbatches), _nmulti(args._nmulti),
          _act(args._act),
          _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
          _Mround(roundup(args._Msize, strategy::out_height())),
          _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
                        iceildiv(args._Nsize, _n_block), args._nmulti) { }
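    /* Execute: for each K block, walk this work item's share of the
     * (rows, batch, N block, multi) window and call the kernel on each output
     * tile.  The first K pass writes C (and applies bias if the kernel can),
     * later passes accumulate, and the activation is only applied on the
     * last pass. */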
    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        strategy strat(_ci);

        /* Make sure we've been set up correctly. */
        assert(_B_transposed);
        static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
        static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");

        for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
            unsigned int kmax   = std::min(k0 + _k_block, _Ksize);
            unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());

            const bool first_pass = (k0 == 0);
            const bool last_pass  = (kmax == _Ksize);

            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));

            if (p.done()) {
                return;
            }

            do {
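                /* Decode this work item: output rows, batch, output columns
                 * and multi index all come from the 4D work window. */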
                const unsigned int m_start = p.dim(0) * strategy::out_height();
                const unsigned int m_end   = std::min(p.dim0_max() * strategy::out_height(), _Msize);
                const unsigned int batch   = p.dim(1);
                const unsigned int n0      = p.dim(2) * _n_block;
                const unsigned int nmax    = std::min(n0 + _n_block, _Nsize);
                const unsigned int multi   = p.dim(3);
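                /* Find the pretransposed panel for this (multi, k0, n0):
                 * each multi occupies roundup(N) x roundup(K) elements, each
                 * K block starts k0 * roundup(N) elements into that, and each
                 * N block within it holds kern_k elements per output column. */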
                const Toi *b_panel = _B_transposed +
                                     (multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) +
                                     (k0 * roundup(_Nsize, strategy::out_width())) +
                                     (n0 * kern_k);

#ifdef CYCLE_PROFILING
                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
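                /* Run the kernel for this tile.  A is read in place; the bias
                 * pointer is only passed on the first K pass (when the kernel
                 * supports fused bias), and the accumulate flag is set on
                 * every pass after the first. */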
                strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
                             b_panel,
                             this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
                             (m_end - m_start), (nmax - n0), kmax-k0,
                             (strategy::supports_bias() && first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                             last_pass ? _act : Activation(), !first_pass);
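                /* If the kernel cannot fuse the bias, add it separately after
                 * the first pass has written C. */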
                if (!strategy::supports_bias() && this->_bias && first_pass) {
                    bias_adder(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
                               this->_bias + (multi * this->_bias_multi_stride) + n0,
                               (m_end - m_start), (nmax - n0));
                }
            } while (p.next_dim1());
        }
    }
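    /* Interface implementation - pretransposed B support.  B must be packed
     * into the layout the kernel expects before execute() can run. */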
    bool B_pretranspose_required() const override {
        return (_B_transposed == nullptr);
    }

    size_t get_B_pretransposed_array_size() const override {
        return roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
    }
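    /* Pack B into the pretransposed buffer, walking multis, K blocks and N
     * blocks in the same order execute() does so the panel offsets used by
     * both sides agree. */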
    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
        _B_transposed = buffer;
        strategy strat(_ci);

        for (unsigned int multi=0; multi<_nmulti; multi++) {
            for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
                const unsigned int kmax   = std::min(k0 + _k_block, _Ksize);
                const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll());

                for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) {
                    const unsigned int xmax = std::min(x0+_n_block, _Nsize);

                    const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;

                    strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
                                              x0, xmax, k0, kmax);

                    buffer += size;
                }
            }
        }
    }
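    /* Accept a buffer that already holds packed B data. */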
    void set_pretransposed_B_data(void *in_buffer) override {
        _B_transposed = reinterpret_cast<Toi *>(in_buffer);
    }
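    /* Rough performance model used when choosing between implementations. */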
    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
        /* ... compute mac_cycles, the estimated MAC cost of the whole problem,
         *     from the problem dimensions and 'params' ... */

        /* Problems whose N dimension is below (or only just above) one kernel
         * width run at reduced efficiency, so the estimate is penalised here. */
        if ((args._Nsize < strategy::out_width()) ||
            (args._Nsize > strategy::out_width() && args._Nsize < 2 * strategy::out_width())) {
            /* ... */
        }

        uint64_t total_cycles = mac_cycles;

        return total_cycles;
    }
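    /* Report the configuration (including the kernel name) this
     * implementation uses. */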
    GemmConfig get_config() override {
        GemmConfig c;
        /* ... */
        c.filter = get_type_name<strategy>();

        return c;
    }
};