37 #ifdef CYCLE_PROFILING
38 #include "profiler.hpp"
// NOTE(review): extraction artifact — the original file's own line numbers
// (e.g. "44") are fused into the code text throughout this chunk, and the
// class/struct keyword this template parameter list belongs to is not
// visible here.
44 template<
typename strategy,
typename To,
typename Tr>
// Convenience aliases for the kernel strategy's internal operand and result
// element types (Toi is the element type of the pretransposed B buffer).
46 typedef typename strategy::operand_type Toi;
47 typedef typename strategy::result_type Tri;
// GEMM problem dimensions: M rows, N columns, K depth.
52 const unsigned int _Msize;
53 const unsigned int _Nsize;
54 const unsigned int _Ksize;
// Batch count and "multi" count (independent GEMMs sharing one call).
56 const unsigned int _nbatches;
57 const unsigned int _nmulti;
// Cache-derived blocking parameters — see compute_k_block()/compute_n_block().
60 const unsigned int _k_block;
61 const unsigned int _n_block;
62 const unsigned int _Mround;
// Pretransposed copy of the B operand; nullptr until one of the
// pretranspose entry points runs (also used as the "needs pretranspose"
// flag further below).
65 const Toi *_B_transposed=
nullptr;
// Per-(multi, column) sums of B, used as the requantization bias
// correction (filled in by requantize_bias()).
70 int32_t *col_bias =
nullptr;
72 void *working_space =
nullptr;
74 unsigned int _nthreads;
// Size in bytes of the column-sum area that sits at the head of the
// user-supplied pretranspose buffer: one int32_t per column, per multi.
// NOTE(review): the closing brace is missing from this extracted chunk.
76 unsigned int get_col_sum_size()
const {
77 return _Nsize * _nmulti *
sizeof(int32_t);
// Choose the K blocking size: an explicit config override wins; otherwise
// size the block so the working set fits in half of L1, rounded to the
// kernel's K unroll.  NOTE(review): the final return statement and some
// closing braces are missing from this extracted chunk.
80 static unsigned int compute_k_block(
const GemmArgs &
args) {
// Explicit user/config override takes precedence over the heuristic.
84 if (
args._cfg &&
args._cfg->inner_block_size) {
85 return args._cfg->inner_block_size;
88 const unsigned int L1_size =
args._ci->get_L1_cache_size();
// Heuristic: half of L1 divided by the footprint of the larger of the
// kernel's output width/height, in Toi elements.
92 unsigned int k_block = (L1_size / 2) / (
sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
// Round down to a whole number of K-unroll units, with a minimum of one.
95 k_block /= strategy::k_unroll();
96 k_block = std::max(k_block, 1U) * strategy::k_unroll();
// Count how many K blocks that implies (presumably used to rebalance the
// block size into equal-ish pieces — TODO confirm against the full file,
// the intermediate lines are missing here).
99 unsigned int numk_blocks =
iceildiv(
args._Ksize, k_block);
105 k_block =
roundup(k_block, strategy::k_unroll());
// Choose the N blocking size: an explicit config override wins; otherwise
// fit the K-block of B (plus A/C slices) in ~90% of L2, rounded to the
// kernel's output width.  NOTE(review): the final return statement and
// some closing braces are missing from this extracted chunk.
110 static unsigned int compute_n_block(
const GemmArgs &
args) {
// Explicit user/config override takes precedence over the heuristic.
111 if (
args._cfg &&
args._cfg->outer_block_size) {
112 return args._cfg->outer_block_size;
// N blocking depends on the K blocking chosen above.
115 const unsigned int k_block = compute_k_block(
args);
116 const unsigned int L2_size =
args._ci->get_L2_cache_size();
// Budget: 90% of L2 minus one kernel tile's worth of A and C slices,
// divided by the per-column cost of a K-block of B.
120 unsigned int n_block = (((L2_size * 9) / 10) - (k_block *
sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
121 (
sizeof(Toi) * k_block);
// Round down to a whole number of output-width units, minimum one.
124 n_block /= strategy::out_width();
125 n_block = std::max(n_block, 1U) * strategy::out_width();
// Count of N blocks (presumably used to rebalance into equal-ish blocks —
// TODO confirm against the full file, intermediate lines are missing).
128 unsigned int numblocks =
iceildiv(
args._Nsize, n_block);
130 n_block =
roundup(n_block, strategy::out_width());
// Constructor member-initializer list.  NOTE(review): the constructor's
// signature is missing from this extracted chunk — it evidently takes a
// GemmArgs (args) and a quantization-parameter object (qp); confirm
// against the full file.  Blocking parameters are computed up front.
141 : _ci(
args._ci), _Msize(
args._Msize), _Nsize(
args._Nsize), _Ksize(
args._Ksize),
142 _nbatches(
args._nbatches), _nmulti(
args._nmulti),
143 _k_block(compute_k_block(
args)), _n_block(compute_n_block(
args)),
146 _qp (qp), _nthreads(
args._maxthreads) { }
// Body of the main execution routine (its signature is not visible in this
// chunk).  Outer loop walks K in _k_block slices; for each work item the
// (m, batch, n, multi) coordinates are decoded from the position object
// `p`, the matching pretransposed B panel is located, and the strategy
// kernel is invoked with the per-multi column bias and the quantization
// parameters _qp.
160 #ifdef CYCLE_PROFILING
// B must have been pretransposed before execution.
166 assert(_B_transposed);
// This path requires the input element type and the kernel operand type
// to be identical (no on-the-fly conversion).
167 static_assert(std::is_same<To, Toi>::value,
"gemm_native: Operand types must be the same.");
172 for (
unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
173 unsigned int kmax = std::min(k0 + _k_block, _Ksize);
// Kernel K length is rounded up to the unroll factor.
174 unsigned int kern_k =
roundup(kmax-k0, strategy::k_unroll());
// Decode the work-item coordinates: dim0 = M tile, dim1 = batch,
// dim2 = N block, dim3 = multi.
183 const unsigned int m_start = p.dim(0) * strategy::out_height();
184 const unsigned int m_end = std::min(p.dim0_max() * strategy::out_height(), _Msize);
185 const unsigned int batch = p.dim(1);
186 const unsigned int n0 = p.dim(2) * _n_block;
187 const unsigned int nmax = std::min(n0 + _n_block, _Nsize);
188 const unsigned int multi = p.dim(3);
// Locate this (multi, k0, n0) panel inside the pretransposed B buffer;
// strides use the rounded-up N and K extents, matching the layout
// produced by the pretranspose routine below.
190 const Toi *b_panel = _B_transposed +
191 (multi *
roundup(_Nsize, strategy::out_width()) *
roundup(_Ksize, strategy::k_unroll())) +
192 (k0 *
roundup(_Nsize, strategy::out_width())) +
196 #ifdef CYCLE_PROFILING
197 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k *
roundup(nmax-n0, strategy::out_width()));
// Run the kernel on this (M range, N range, K slice) with the bias slice
// for this multi.
199 strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
201 this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
202 (m_end - m_start), (nmax - n0), kmax - k0,
203 col_bias + (multi * _Nsize) + n0, _qp);
205 }
while (p.next_dim1());
// Fragment of two accessors (signatures not visible in this chunk):
// 1) whether B still needs pretransposing — true while _B_transposed is
//    unset.
215 return (_B_transposed==
nullptr);
// 2) required pretranspose buffer size: the column-sum area followed by
//    the pretransposed B data for all multis, with N and K rounded up to
//    the kernel's tiling — the same layout the execute path indexes into.
219 return get_col_sum_size() + (
roundup(_Nsize, strategy::out_width()) *
roundup(_Ksize, strategy::k_unroll()) * _nmulti *
sizeof(Toi));
// Compute the per-column sums of B (the requantization bias correction)
// into the head of the supplied buffer — one _Nsize-long int32_t slice per
// multi.  NOTE(review): the closing braces are missing from this
// extracted chunk.
222 void requantize_bias(
void *in_buffer,
const To *B,
const int ldb,
const int B_multi_stride)
override {
// Column sums live at the very start of in_buffer.
223 col_bias =
reinterpret_cast<int32_t *
>(in_buffer);
225 for (
unsigned int i=0; i<_nmulti; i++) {
226 compute_col_sums(_qp, _Nsize, _Ksize, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize, i, 0);
// Body of the B pretranspose routine (signature not visible in this
// chunk): lays out B after the column-sum area, K-blocks outer and
// N-blocks inner, per multi, using the strategy's PrepareB transform.
// The transposed data starts immediately after the column sums.
233 uintptr_t buffer_int =
reinterpret_cast<uintptr_t
>(in_buffer);
234 Toi *buffer =
reinterpret_cast<Toi *
>(buffer_int + get_col_sum_size());
235 _B_transposed = buffer;
238 for (
unsigned int multi=0; multi<_nmulti; multi++) {
239 for (
unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
240 const unsigned int kmax = std::min(k0 + _k_block, _Ksize);
// K extent of this block, rounded up to the kernel's unroll.
241 const unsigned int k_size =
roundup(kmax-k0, strategy::k_unroll());
243 for (
unsigned int x0=0; x0<_Nsize; x0+=_n_block) {
244 const unsigned int xmax = std::min(x0+_n_block, _Nsize);
// Output size of this panel: rounded-up N extent times rounded-up K.
246 const unsigned int size =
roundup(xmax-x0, strategy::out_width()) * k_size;
// NOTE(review): PrepareB's trailing arguments (extents, buffer advance)
// are missing from this extracted chunk.
248 strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
// Fragment: adopt an externally pretransposed buffer (signature not
// visible in this chunk).  Mirrors the layout above: column sums at the
// start of the buffer, transposed B data immediately after them.
258 uintptr_t buffer_int =
reinterpret_cast<uintptr_t
>(in_buffer);
259 _B_transposed =
reinterpret_cast<Toi *
>(buffer_int + get_col_sum_size());
260 col_bias =
reinterpret_cast<int32_t *
>(in_buffer);