#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

#define ALLOC_ROUND 64
#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
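// Editorial note (not in the original source): ROUND_UP rounds an allocation
// size up to the next multiple of ALLOC_ROUND (64 bytes). For example,
// ROUND_UP(70) == ((70 + 63) / 64) * 64 == 128, while ROUND_UP(64) stays 64,
// so the per-thread working buffers below always occupy whole 64-byte chunks.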
template<typename strategy, typename To, typename Tr>
typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type  Tri;

const unsigned int _Msize;
const unsigned int _Nsize;
const unsigned int _Ksize;

const unsigned int _nbatches;
const unsigned int _nmulti;

const int _maxthreads;

unsigned int _k_block=0;
unsigned int _x_block=0;

unsigned int _Mround_div=0;
unsigned int _Mround=0;
unsigned int _Nround_div=0;
unsigned int _Nround=0;

const Toi *_B_transposed=nullptr;
void *_working_space=nullptr;

unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;

unsigned int _index=0;
, _xmax { parent._Nsize }

assert(_x0 <= _xmax);

unsigned int xmax() {
    return std::min(_x0 + _parent._x_block, _xmax);
}

unsigned int kmax() {
    return std::min(_k0 + _parent._k_block, _parent._Ksize);
}

_x0 += _parent._x_block;

_k0 += _parent._k_block;
if (_k0 >= _parent._Ksize) {

if (_multi >= _parent._nmulti) {
unsigned int k0(void) { return _k0; }
unsigned int x0(void) { return _x0; }
unsigned int multi(void) { return _multi; }
unsigned int index(void) { return _index; }
bool done(void) { return _done; }
bool newkblock(void) { return _newkblock; }
size_t get_a_working_size() const {
    return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2;
}

size_t get_b_working_size() const {

size_t get_c_working_size() const {
    return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
}
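// Editorial note (not in the original source): these helpers size the scratch
// regions, each rounded with ROUND_UP. The A region covers one _Mround x
// _k_block block per batch (the trailing "* 2" presumably double-buffers it),
// and the C region holds a single _x_block x out_height() accumulator panel;
// the callers below multiply these sizes by _maxthreads so every thread gets
// its own copy.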
void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int, int) {
    assert(_B_transposed);
    assert(_working_space);

#ifdef CYCLE_PROFILING

    const unsigned int window_per_batch = _Mround / strategy::out_height();
    unsigned int batch_0   = m_start / window_per_batch;
    unsigned int batch_end = m_end   / window_per_batch;

    unsigned int m_0   = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
    unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();

    unsigned int n_0   = std::min(this->_Nsize, strategy::out_width() * n_start);
    unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);

    blockwalker current(*this, n_0, n_max);

    int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);

    auto c_panel_start = working_space_bytes;
    auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;

    auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid);
    auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * threadid);

    const Toi *b_panel_start = _B_transposed;
    const Toi *b_panel = b_panel_start;

    unsigned b_page_size = 0;
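    // Editorial note (not in the original source): the flat working space is
    // laid out as [C panels for all threads][A panels for all threads], and
    // each thread indexes its own slice via get_c_working_size() * threadid
    // and get_a_working_size() * threadid, so threads never touch each
    // other's scratch panels.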
    for (; !current.done(); current.advance()) {
        int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());

        if (current.newkblock()) {
            kern_k  = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
            kern_k *= strat.k_unroll();

            unsigned b_thread_start_offset = iceildiv(current.x0(), strategy::out_width());

            b_panel_start += b_page_size;
            b_panel       = b_panel_start + (b_thread_start_offset * strat.out_width() * kern_k);
            b_page_size   = _Nround * kern_k;
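            // Editorial note (not in the original source): on each new K
            // block, kern_k is the K extent rounded up to a multiple of
            // k_unroll(), and b_page_size is the element count of one full
            // "page" of pretransposed B spanning all _Nround columns for that
            // K block. b_panel_start first advances past the previous page
            // (initially zero), and offsetting by the thread's starting x
            // block lands b_panel on this thread's first B panel in the
            // current page.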
            unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
            unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

            if (first_m >= last_m)
                continue;

            auto a_thread_panel_in = this->_Aptr
                                   + (batch * this->_A_batch_stride)
                                   + (current.multi() * this->_A_multi_stride);

            auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);

            strat.transforms.PrepareA(

        unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
        unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

        const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;

        if (first_m >= last_m)
            continue;

        for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
            unsigned int ymax = std::min(_Msize, y + strategy::out_height());

            strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
            a_ptr += (strategy::out_height() * kern_k);
            const bool first_pass = current.k0()==0;
            const bool last_pass  = current.kmax()==_Ksize;

            auto c_panel_out = this->_Cptr
                             + this->_C_batch_stride * batch
                             + this->_C_multi_stride * current.multi();

            auto bias = (first_pass && this->_bias)
                      ? this->_bias + (current.multi() * this->_bias_multi_stride)
                      : nullptr;

            strat.transforms.Merge(

        b_panel += (bblocks * strat.out_width() * kern_k);
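        // Editorial note (not in the original source): after the inner kernel
        // has accumulated into the per-thread c_panel, Merge writes the block
        // out to the real C array (offset by the batch and multi strides).
        // The bias pointer is only passed on the first K pass
        // (current.k0() == 0), last_pass marks the final K block for this
        // output tile, and b_panel then advances past the bblocks panels just
        // consumed.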
static unsigned int get_k_block_size(const GemmArgs &args) {
    unsigned int k_block;

    k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));

    k_block /= strategy::k_unroll();
    k_block = std::max(k_block, 1U) * strategy::k_unroll();

    k_block = iceildiv(k_block, strategy::k_unroll());
    k_block *= strategy::k_unroll();
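    // Editorial note (not in the original source): the K block is chosen so
    // that a strip of the larger operand dimension fits in roughly half of
    // L1, then forced to a multiple of k_unroll(). As an illustration only
    // (these numbers are assumptions, not taken from any particular target):
    // with L1_size = 32768, sizeof(Toi) = 4,
    // max(out_width(), out_height()) = 8 and k_unroll() = 1,
    // k_block = (32768 / 2) / (4 * 8) = 512.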
, _Msize(args._Msize)
, _Nsize(args._Nsize)
, _Ksize(args._Ksize)
, _nbatches(args._nbatches)
, _nmulti(args._nmulti)
, _maxthreads(args._maxthreads)
, _nthreads(args._maxthreads)
, _k_block(get_k_block_size(args))
, _Mround ( _Mround_div * strategy::out_height() )
, _Nround ( _Nround_div * strategy::out_width() )
assert(_maxthreads > 0);

_x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
           (sizeof(Toi) * _k_block);

_x_block /= strategy::out_width();
_x_block = std::max(_x_block, 1U) * strategy::out_width();

unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
_x_block = iceildiv(_Nsize, num_x_blocks);

_x_block = iceildiv(_x_block, strategy::out_width());
_x_block *= strategy::out_width();
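// Editorial note (not in the original source): the X (N-direction) block is
// budgeted from about 90% of L2: one out_width()-wide B strip and one
// out_height()-tall A strip (each k_block deep) are reserved, and the
// remainder decides how many k_block-deep columns fit, rounded to a multiple
// of out_width(). The final steps rebalance _x_block so _Nsize splits into
// roughly equal blocks instead of one full block plus a small tail.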
unsigned m = (_Mround / strategy::out_height()) * _nbatches;
unsigned n = _Nround_div;
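// Editorial note (not in the original source): the scheduling window is
// two-dimensional: m counts out_height()-row blocks across all batches and n
// counts out_width()-column blocks (_Nround_div), matching the (m, n) ranges
// that execute() later hands to execute_pretranspose().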
_nthreads = std::min(nthreads, _maxthreads);
const auto m_size = work_range.get_size(0);
const auto n_size = work_range.get_size(1);
const auto m_end  = m_start + m_size;
const auto n_end  = n_start + n_size;

execute_pretranspose(m_start, m_end, n_start, n_end, threadid, m_threadid, n_threadid);

return get_c_working_size() * _maxthreads
     + get_a_working_size() * _maxthreads
int8_t  *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
intptr_t working_space_int   = reinterpret_cast<intptr_t>(working_space);

if (working_space_int & 0x3F) {
    diff = 0x40 - (working_space_int & 0x3F);
}

working_space_bytes += diff;

_working_space = reinterpret_cast<void *>(working_space_bytes);
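// Editorial note (not in the original source): 0x3F masks the low six address
// bits, so this nudges the supplied working space up to the next 64-byte
// boundary. For example, a pointer ending in 0x10 gets diff = 0x40 - 0x10 =
// 0x30 added, after which the buffer starts cache-line aligned.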
return _B_transposed==nullptr;
blockwalker current(*this);

do {
    unsigned int x_size = (current.xmax() - current.x0());
    unsigned int k_size = (current.kmax() - current.k0());

    x_size = iceildiv(x_size, strategy::out_width());
    x_size *= strategy::out_width();

    k_size = iceildiv(k_size, strategy::k_unroll());
    k_size *= strategy::k_unroll();

    total += x_size * k_size * sizeof(Toi);
} while (current.advance());
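// Editorial note (not in the original source): the pretransposed-B size is the
// sum, over every block the blockwalker visits, of that block's extent rounded
// up to whole out_width() x k_unroll() tiles, times sizeof(Toi). This mirrors
// the layout pretranspose_B_array() writes below, so the two walks must stay
// in step.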
blockwalker current(*this);
Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
_B_transposed = buffer;

do {
    unsigned int x_size = (current.xmax() - current.x0());
    unsigned int k_size = (current.kmax() - current.k0());

    x_size = iceildiv(x_size, strategy::out_width());
    x_size *= strategy::out_width();

    k_size = iceildiv(k_size, strategy::k_unroll());
    k_size *= strategy::k_unroll();

    strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
                              current.x0(), current.xmax(), current.k0(), current.kmax());

    buffer += (x_size * k_size);
} while (current.advance());
_B_transposed = reinterpret_cast<Toi *>(in_buffer);
unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
unsigned int n_blocks = iceildiv(args._Nsize, strategy::out_width());

float ratio = m_blocks / static_cast<float>(n_blocks);

unsigned int ideal_height = static_cast<unsigned int>(std::sqrt(args._maxthreads * ratio) + 0.5);
unsigned int height = 1;

if (ideal_height == 0) {

for (unsigned int adj=0; adj<ideal_height; adj++) {
    const unsigned int round_down = ideal_height - adj;

    const unsigned int round_up = ideal_height + adj;

float total_cycles = mac_cycles + prepare_cycles + merge_cycles;

total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);

return static_cast<uint64_t>(total_cycles);
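// Editorial note (not in the original source): estimate_cycles() first guesses
// an "ideal" thread-grid height as sqrt(maxthreads * m_blocks / n_blocks),
// then appears to search nearby heights (round_down / round_up) for one that
// fits the thread count; the selection condition itself is elided in this
// listing. The estimate is the sum of MAC, prepare and merge cycle costs,
// scaled by maxthreads / parallelism_available, presumably only when the
// available 2D parallelism falls short of the requested thread count.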