36 #ifdef CYCLE_PROFILING
37 #include "profiler.hpp"
// Template header and state for a strategy-parameterised GEMM implementation
// (the class name/header is not visible in this extract).
//   strategy - micro-kernel strategy type supplying out_width()/out_height(),
//              k_unroll(), operand/result types and the kernel itself.
//   To / Tr  - caller-facing operand and result element types.
// NOTE(review): interior lines are missing from this extract (the embedded
// source line numbers jump), so comments describe only what is visible.
43 template<
typename strategy,
typename To,
typename Tr>
// Internal operand/result element types chosen by the strategy; static_asserts
// later in this file require them to be identical to To/Tr.
45 typedef typename strategy::operand_type Toi;
46 typedef typename strategy::result_type Tri;
// Problem dimensions (M x N x K), fixed at construction.
51 const unsigned int _Msize;
52 const unsigned int _Nsize;
53 const unsigned int _Ksize;
// Number of independent batches and of independent GEMMs ("multi" dimension).
55 const unsigned int _nbatches;
56 const unsigned int _nmulti;
// Blocking parameters, computed once via compute_k_block()/compute_n_block().
61 const unsigned int _k_block;
62 const unsigned int _n_block;
63 const unsigned int _Mround;
// Pre-transposed copy of the B matrix; nullptr until the pretranspose step
// runs (used elsewhere in the file to answer B_pretranspose_required()).
66 const Toi *_B_transposed=
nullptr;
// Pick the K-dimension blocking size for this problem.
// Visible behaviour: an explicit config override (inner_block_size) wins and
// is rounded up to the strategy's k_unroll; otherwise a target block of
// 2048 bytes of To is split evenly across ceil(K/target) blocks.
// NOTE(review): this extract is missing interior lines — the body of the
// !supports_accumulate() branch, closing braces, and the final return are not
// visible, so the complete control flow cannot be confirmed here.
70 static unsigned int compute_k_block(
const GemmArgs &
args) {
// Strategies that cannot accumulate into C must process all of K in one pass;
// the (missing) body of this branch presumably returns early — TODO confirm.
72 if (!strategy::supports_accumulate()) {
// Caller-specified inner block size takes priority, rounded to k_unroll.
76 if (
args._cfg &&
args._cfg->inner_block_size) {
77 return roundup(
args._cfg->inner_block_size, strategy::k_unroll());
// Aim for ~2KB of operand data per K block.
81 unsigned int target_block_size = 2048 /
sizeof(To);
// Only block if K is comfortably larger (>= 1.5x) than the target; then split
// K into equal-sized blocks rather than one big block plus a small remainder.
83 if (
args._Ksize >= ((3 * target_block_size) / 2)) {
84 unsigned int target_blocks =
iceildiv(
args._Ksize, target_block_size);
86 unsigned int block_size =
iceildiv(
args._Ksize, target_blocks);
// Round the per-block size up to the kernel's K unroll factor.
88 block_size =
roundup(block_size, strategy::k_unroll());
// Pick the N-dimension (outer) blocking size for this problem.
// Visible behaviour: an explicit config override (outer_block_size) is rounded
// DOWN to a multiple of out_width() (minimum one full width); otherwise a set
// of shape-based heuristics selects a multiple of out_width().
// NOTE(review): interior lines are missing (e.g. the return values of the
// N<=64 and M/N>155 branches, closing braces), so only the visible branches
// are documented.
98 static unsigned int compute_n_block(
const GemmArgs &
args) {
99 if (
args._cfg &&
args._cfg->outer_block_size) {
100 unsigned int n_block =
args._cfg->outer_block_size;
// Truncate to a whole number of kernel output widths, but never below one.
103 n_block /= strategy::out_width();
104 n_block = std::max(n_block, 1u) * strategy::out_width();
// Heuristic: small N — branch body not visible in this extract.
109 if (
args._Nsize <= 64) {
// Heuristic: very tall-and-narrow problems (M/N > 155) — note this divides by
// _Nsize; presumably _Nsize is nonzero by contract — TODO confirm.
113 if ((
args._Msize /
args._Nsize) > 155) {
// Heuristic: small K with few threads — use a wider (3x) N block.
118 if ((
args._Ksize <= 128) && (
args._maxthreads <= 16)) {
119 return strategy::out_width() * 3;
// Default: a single kernel output width.
122 return strategy::out_width();
// Constructor member-initialiser list (the constructor's signature is not
// visible in this extract). Copies the problem description out of GemmArgs
// and computes the blocking parameters once up front.
131 : _ci(
args._ci), _Msize(
args._Msize), _Nsize(
args._Nsize), _Ksize(
args._Ksize),
132 _nbatches(
args._nbatches), _nmulti(
args._nmulti),
// Blocking is fixed at construction via the static heuristics above.
134 _k_block(compute_k_block(
args)), _n_block(compute_n_block(
args)),
// Main execution body (the enclosing function's signature is not visible in
// this extract; from the use of p.dim()/p.next_dim1() it appears to walk an
// externally supplied N-dimensional work-partition object — TODO confirm).
// Requires B to have been pre-transposed, and the strategy's internal types
// to match the interface types.
150 #ifdef CYCLE_PROFILING
156 assert(_B_transposed);
157 static_assert(std::is_same<To, Toi>::value,
"gemm_native: Operand types must be the same.");
158 static_assert(std::is_same<Tr, Tri>::value,
"gemm_native: Result types must be the same.");
// Outer loop over K blocks; each pass accumulates into C except the first.
163 for (
unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
164 unsigned int kmax = std::min(k0 + _k_block, _Ksize);
// Kernel K extent is rounded up to the unroll factor (padding handled by the
// pre-transposed B layout).
165 unsigned int kern_k =
roundup(kmax-k0, strategy::k_unroll());
// First pass writes C (and applies bias); last pass applies the activation.
167 const bool first_pass = (k0 == 0);
168 const bool last_pass = (kmax == _Ksize);
// Decode the current work item: rows [m_start,m_end), batch, N block
// [n0,nmax), and multi index.
177 const unsigned int m_start = p.dim(0) * strategy::out_height();
178 const unsigned int m_end = std::min(p.dim0_max() * strategy::out_height(), _Msize);
179 const unsigned int batch = p.dim(1);
180 const unsigned int n0 = p.dim(2) * _n_block;
181 const unsigned int nmax = std::min(n0 + _n_block, _Nsize);
182 const unsigned int multi = p.dim(3);
// Locate this (multi, k0, n0) panel inside the pre-transposed B buffer; the
// buffer is laid out multi-major, then K, with N padded to out_width().
184 const Toi *b_panel = _B_transposed +
185 (multi *
roundup(_Nsize, strategy::out_width()) *
roundup(_Ksize, strategy::k_unroll())) +
186 (k0 *
roundup(_Nsize, strategy::out_width())) +
189 #ifdef CYCLE_PROFILING
// Profile cost model: output area times kernel K depth.
190 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (
unsigned long)(m_end - m_start) * kern_k *
roundup(nmax-n0, strategy::out_width()));
// Run the micro-kernel on this tile. Bias is passed on the first pass only
// (when the strategy supports fused bias); activation on the last pass only;
// accumulation is requested on every pass after the first.
193 strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
195 this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
196 (m_end - m_start), (nmax - n0), kmax-k0,
197 (strategy::supports_bias() && first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) + n0 :
nullptr,
198 last_pass ? _act :
Activation(), !first_pass);
// Fallback: strategies without fused bias get the bias added separately after
// the first pass writes C.
201 if (!strategy::supports_bias() && this->_bias && first_pass) {
202 bias_adder(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
203 this->_bias + (multi * this->_bias_multi_stride) + n0,
204 (m_end - m_start), (nmax - n0));
207 }
// Advance to the next work item in the partition.
while (p.next_dim1());
// Pretranspose query fragments (enclosing function signatures not visible).
// Presumably B_pretranspose_required(): true until pretranspose has run.
217 return (_B_transposed==
nullptr);
// Presumably get_B_pretransposed_array_size(): bytes needed for the padded
// (N rounded to out_width, K rounded to k_unroll) B copy across all multis.
221 return roundup(_Nsize, strategy::out_width()) *
roundup(_Ksize, strategy::k_unroll()) * _nmulti *
sizeof(Toi);
// Pre-transpose B into the caller-provided buffer (enclosing function
// signature not visible). Walks multi / K-block / N-block in the same order
// the execute loop consumes panels, so panel offsets line up.
// NOTE(review): the PrepareB argument list and the buffer-advance statement
// are cut off in this extract.
225 Toi *buffer =
reinterpret_cast<Toi *
>(in_buffer);
226 _B_transposed = buffer;
229 for (
unsigned int multi=0; multi<_nmulti; multi++) {
230 for (
unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
231 const unsigned int kmax = std::min(k0 + _k_block, _Ksize);
// Stored K extent is padded to the kernel's unroll factor.
232 const unsigned int k_size =
roundup(kmax-k0, strategy::k_unroll());
234 for (
unsigned int x0=0; x0<_Nsize; x0+=_n_block) {
235 const unsigned int xmax = std::min(x0+_n_block, _Nsize);
// Panel size in elements: N padded to out_width times padded K.
237 const unsigned int size =
roundup(xmax-x0, strategy::out_width()) * k_size;
239 strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
// Presumably set_pretransposed_B_data(): adopt an already-transposed buffer
// without re-running the transform — TODO confirm against the full source.
249 _B_transposed =
reinterpret_cast<Toi *
>(in_buffer);
// Performance/cycle estimate fragment (enclosing function signature not
// visible). Total MAC count uses the padded N and K extents, matching the
// work the kernel actually performs on rounded-up tiles.
257 uint64_t total_macs =
static_cast<uint64_t
>(
args._nbatches) *
args._nmulti *
args._Msize *
roundup(
args._Nsize, strategy::out_width()) *
roundup(
args._Ksize, strategy::k_unroll());
// Penalise shapes that waste lanes: N smaller than one output width, or
// between one and two widths (the branch body is cut off in this extract).
265 if ((
args._Nsize < strategy::out_width()) || (
args._Nsize > strategy::out_width() &&
args._Nsize < 2*strategy::out_width())) {
269 uint64_t total_cycles = mac_cycles;
// Tag the returned estimate with this strategy's type name for reporting.
280 c.
filter = get_type_name<strategy>();