gemm_interleaved.hpp (Compute Library 21.02)
1 /*
2  * Copyright (c) 2017-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #pragma once
25 
26 #include <algorithm>
27 #include <cassert>
28 
29 #include "arm_gemm.hpp"
30 #include "convolver.hpp"
31 #include "mergeresults.hpp"
32 #include "performance_parameters.hpp"
33 #include "quantized.hpp"
34 #include "transform.hpp"
35 #include "utils.hpp"
36 
37 #ifdef CYCLE_PROFILING
38 #include "profiler.hpp"
39 #endif
40 
41 // Some macros used to decide how much working space to allocate.
42 // Round allocations up to the next cache line.
43 #define ALLOC_ROUND 64
44 #define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
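// Illustrative only (a quick sketch of the rounding, assuming the 64-byte ALLOC_ROUND above):
static_assert(ROUND_UP(1)  == 64,  "sub-line allocations grow to one cache line");
static_assert(ROUND_UP(64) == 64,  "exact multiples are left alone");
static_assert(ROUND_UP(65) == 128, "anything over spills to the next cache line");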
45 
46 // Implementation of the GemmCommon abstract class.
47 //
48 // This implementation interleaves the source matrices in blocks - good for
49 // larger matrices.
50 
51 namespace arm_gemm {
52 
53 namespace {
54 
55 // Some kernels output to a linear buffer and require a separate merge step.
56 // Others output directly to the matrix result. This helper class calls the
57 // appropriate functions, using templating to avoid calling non-existent
58 // functions.
59 template<bool MergeStep, typename OutputStage>
60 class kernel_and_merge {
61 public:
62  template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
63  static void run (
64 #ifdef CYCLE_PROFILING
65  profiler &prof,
66 #endif
67  strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
68  Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
69  unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
70  const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias,
71  Tab *acc_buff);
72 };
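// The idea in miniature (illustrative only): each <MergeStep, OutputStage> pair gets its own
// specialisation of run(), so a call only ever instantiates the member functions that particular
// path needs - e.g. strat.transforms.Merge() is never touched on the merge-free paths below.
//
//   kernel_and_merge<true,  Nothing>::run(...)      // kernel into c_panel, then Merge()
//   kernel_and_merge<false, Nothing>::run(...)      // kernel writes C (or the accumulation buffer) directly
//   kernel_and_merge<false, Requantize32>::run(...) // kernel requantizes as it writes
//   kernel_and_merge<true,  Requantize32>::run(...) // kernel into c_panel, then requantize_block_32()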
73 
74 // Run a kernel and call the separate merge step
75 template<>
76 template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
77 void kernel_and_merge<true, Nothing>::run(
78 #ifdef CYCLE_PROFILING
79  profiler &prof,
80 #endif
81  strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
82  Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
83  unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
84  const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
85 {
86  const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
87 
88  {
89 #ifdef CYCLE_PROFILING
90  auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
91 #endif
92 
93  strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
94  }
95 
96  {
97 #ifdef CYCLE_PROFILING
98  auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
99 #endif
100  strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate);
101  }
102 }
103 
104 // Run a kernel with integrated merge
105 template<>
106 template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
107 void kernel_and_merge<false, Nothing>::run(
108 #ifdef CYCLE_PROFILING
109  profiler &prof,
110 #endif
111  strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
112  Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
113  unsigned int n_0, unsigned int n_max, const Tr *biasptr,
114  const Activation &act, bool accumulate, const Nothing &, const int32_t *,
115  Tab *acc_buff)
116 {
117 #ifdef CYCLE_PROFILING
118  auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
119 #endif
120 
121  // We need to offset the C pointer, but as it might be NULL (requesting output to accumulation buffer) we need
122  // to be careful not to offset a null pointer.
123  Tri *offset_c_ptr;
124 
125  if (c_ptr == nullptr) {
126  offset_c_ptr = nullptr;
127  } else {
128  offset_c_ptr = c_ptr + m_0 * ldc + n_0;
129  }
130 
131  strat.kernel(// A and B pointers are just the packed panels.
132  a_ptr, b_panel,
133  // Provide relevant part of output array and row stride.
134  offset_c_ptr, ldc,
135  // M, N, K sizes
136  m_max-m_0, n_max - n_0, kern_k,
137  // Bias, activation, accumulation. Need to offset the bias as needed.
138  biasptr ? biasptr + n_0 : nullptr, act, accumulate,
139  // Accumulation buffer.
140  acc_buff );
141 }
142 
143 // Run a kernel with integrated merge, quantizing
144 template<>
145 template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
146 void kernel_and_merge<false, Requantize32>::run(
147 #ifdef CYCLE_PROFILING
148  profiler &prof,
149 #endif
150  strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
151  Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
152  unsigned int n_0, unsigned int n_max, const Tr *,
153  const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias,
154  Tab *acc_buff)
155 {
156 #ifdef CYCLE_PROFILING
157  auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
158 #endif
159 
160  strat.kernel(// A and B pointers are just the packed panels.
161  a_ptr, b_panel,
162  // Provide relevant part of output array and row stride.
163  c_ptr + m_0 * ldc + n_0, ldc,
164  // M, N, K sizes
165  m_max-m_0, n_max - n_0, kern_k,
166  // Bias, activation, accumulation. Need to offset the bias as needed.
167  col_bias + n_0, qp, n_0, accumulate, acc_buff);
168 }
169 
170 // Run a kernel and call the separate quantize step
171 template<>
172 template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
173 void kernel_and_merge<true, Requantize32>::run(
174 #ifdef CYCLE_PROFILING
175  profiler &prof,
176 #endif
177  strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
178  Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
179  unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *,
180  const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias,
181  Tab *)
182 {
183  const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
184 
185  {
186 #ifdef CYCLE_PROFILING
187  auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
188 #endif
189 
190  strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
191  }
192 
193  {
194 #ifdef CYCLE_PROFILING
195  auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
196 #endif
197  // The interleaved kernel outputs in blocks - each block is a
198  // row-major matrix of size out_width * out_height. The merge
199  // kernels are designed to deal with this but the requantizer is
200  // not, so we need to requantize one block at a time.
201  for (int i=0; i<bblocks; i++) {
202  unsigned int n_start = n_0 + (strategy::out_width() * i);
203  unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);
204 
205  // The row bias is interleaved with the transposed A data; get a pointer to it here.
206  const int32_t *row_bias = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k);
207 
208  requantize_block_32(qp, (n_end - n_start), (m_max-m_0),
209  c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(),
210  c_ptr + m_0 * ldc + n_start, ldc,
211  row_bias, col_bias + n_start, n_start);
212  }
213  }
214 }
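// Layout sketch for the loop above (illustrative numbers): with out_width()==12 and
// out_height()==8, block i of c_panel is an 8x12 row-major tile starting at
//   c_panel + i * 12 * 8
// and covers output columns n_0 + i*12 .. min(n_0 + (i+1)*12, n_max) - 1, which is
// exactly the [n_start, n_end) range handed to requantize_block_32().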
215 
216 // Integer GEMMs can be used in two contexts - "normal" where the full 32-bit output is required, or in
217 // "requantizing" context where the output will be requantized.
218 //
219 // These require different input transforms, as if we are requantizing we want to sum the rows of the A input, and
220 // if we are not we don't.
221 //
222 // This helper class allows the appropriate transforms to be found, without requiring kernels that don't support
223 // quantization to define useless "quantized" transforms.
224 template<typename strategy, bool quantized>
225 class transform_type {
226 public:
227  typedef decltype(strategy::transforms) type;
228 };
229 
230 template<typename strategy>
231 class transform_type<strategy, true> {
232 public:
233  typedef decltype(strategy::transforms_quantized) type;
234 };
235 
236 // We need a similar trick here to figure out what type the accumulator buffer should be.
237 template<typename strategy, typename OutputStage>
238 class accumulate_buffer_type {
239 public:
240  typedef typename strategy::result_type type;
241 };
242 
243 template<typename strategy>
244 class accumulate_buffer_type<strategy, Requantize32> {
245 public:
246  typedef int32_t type;
247 };
248 
249 } // anonymous namespace
250 
251 template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool ForceThreadColumns=false>
252 class GemmInterleaved : public GemmCommon<To, Tr> {
253  typedef typename strategy::operand_type Toi;
254  typedef typename strategy::result_type Tri;
255  typedef typename accumulate_buffer_type<strategy, OutputStage>::type Tab;
256 
257  /* const properties set by constructor */
258  const CPUInfo * const _ci;
259 
260  const unsigned int _Msize;
261  const unsigned int _Nsize;
262  const unsigned int _Ksize;
263  const unsigned int _Ksections;
264  const unsigned int _Ktotal;
265  const unsigned int _rounded_Ksize;
266 
267  const unsigned int _nbatches;
268  const unsigned int _nmulti;
269 
270  const bool _thread_columns;
271 
272  const Activation _act;
273 
274  const int _maxthreads;
275  int _nthreads;
276 
277  /* Blocking info */
278  unsigned int _k_block=0;
279  unsigned int _x_block=0;
280  unsigned int _Mround=0;
281 
282  /* Working space, pretransposed buffer, buffer manager */
283  const Toi *_B_transposed=nullptr;
284  void *_working_space=nullptr;
285 
286  Tab *_accumulation_buffer=nullptr;
287 
288  /* Output stage */
289  OutputStage _os;
290 
291  /* Quantized support (in addition to the 'output stage' above) */
292  int32_t *col_bias = nullptr;
293 
294  /* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
295  const To * const * const * _indirect_buf = nullptr;
296 
297  /* Convolver - only set up for convolution problems, so also doubles as a flag. */
298  std::unique_ptr<convolver<To>> _convolver = nullptr;
299 
300  unsigned int get_col_sum_size() const {
301  if (std::is_same<OutputStage, Requantize32>::value) {
302  return _Nsize * _nmulti * sizeof(int32_t);
303  } else {
304  return 0;
305  }
306  }
307 
308  /* We will need to walk through the blocks of B in a few contexts, so
309  * factor that out. */
310  class blockwalker {
311  private:
312  /* Size loops, etc. based on our parent's configuration */
313  const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &_parent;
314 
315  /* K, X and multi parameters for current iteration. */
316  unsigned int _k0=0, _x0=0, _multi=0;
317 
318  /* Range of X to iterate over - used in "ForceThreadColumns" cases */
319  unsigned int _x_start=0;
320  unsigned int _x_end=_parent._Nsize;
321 
322  unsigned int _index=0;
323  bool _done=false;
324  bool _newkblock=true;
325  bool _newmulti=true;
326 
327  public:
328  blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent) : _parent(parent) { }
329 
330  blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent,
331  unsigned int x_start, unsigned int x_end) : _parent(parent), _x0(x_start), _x_start(x_start), _x_end(x_end) { }
332 
333  unsigned int xmax() {
334  return std::min(_x0 + _parent._x_block, _x_end);
335  }
336 
337  unsigned int kmax() {
338  return std::min(_k0 + _parent._k_block, _parent._Ktotal);
339  }
340 
341  /* Advance to the next block, return false at the end. */
342  bool advance(void) {
343  if (_done) {
344  return false;
345  }
346 
347  _newkblock=false;
348  _x0 += _parent._x_block;
349  if (_x0 >= _x_end) {
350  _x0=_x_start;
351  _k0 += _parent._k_block;
352  if (_k0 >= _parent._Ktotal) {
353  _k0=0;
354  _multi++;
355  if (_multi >= _parent._nmulti) {
356  _done=true;
357  return false;
358  }
359  _newmulti=true;
360  }
361  _newkblock=true;
362  }
363  _index++;
364 
365  return true;
366  }
367 
368  unsigned int k0(void) { return _k0; }
369  unsigned int x0(void) { return _x0; }
370  unsigned int multi(void) { return _multi; }
371  unsigned int index(void) { return _index; }
372  bool done(void) { return _done; }
373  bool newkblock(void) { return _newkblock; }
374  };
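// The traversal produced by advance() is equivalent to this nesting (illustrative only -
// the real code flattens it into one call per block):
//
//   for (multi = 0; multi < _nmulti; multi++)
//     for (k0 = 0; k0 < _Ktotal; k0 += _k_block)        // newkblock() is true on entry here
//       for (x0 = _x_start; x0 < _x_end; x0 += _x_block)
//         visit(multi, k0, x0);                          // one iteration of the walker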
375 
376  // "k block" has two distinct uses: figuring out which iterations of K
377  // to actually process, and also various size/pointer computations. The
378  // latter needs to take account of the extra space needed for the row
379  // sums, if appropriate.
380  unsigned int get_total_k_depth() const {
381  unsigned int k_depth = _k_block;
382 
383  if (std::is_same<OutputStage, Requantize32>::value) {
384  k_depth += sizeof(int32_t) / sizeof(Toi);
385  }
386 
387  return k_depth;
388  }
389 
390  // A working size.
391  size_t get_a_working_size() const {
392  if (_thread_columns) {
393  // For 2D threading: allocate a buffer of one block of rows per thread
394  return ROUND_UP(sizeof(Toi) * get_total_k_depth() * strategy::out_height() * _maxthreads);
395  } else {
396  // For 1D threading: only one of these is needed, regardless of thread count; it is divided up according to the window.
397  return ROUND_UP(sizeof(Toi) * get_total_k_depth() * _Mround * _nbatches);
398  }
399  }
400 
401  // C working size: One needed per thread. Not needed if there is no merge step.
402  size_t get_c_working_size() const {
403  if (MergeStep) {
404  return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
405  } else {
406  return 0;
407  }
408  }
409 
410  // Accumulation buffer size
411  size_t get_accumulation_buffer_size() const {
412  // We only support an accumulation buffer for non-merge cases.
413  if (MergeStep) {
414  return 0;
415  }
416 
417  // Check if we are actually blocking
418  if (_k_block == _Ktotal) {
419  return 0;
420  }
421 
422  // We are no-merge, non-quantized with active blocking: accumulation buffer needed.
423  size_t size_per_buffer = sizeof(Tab) * strategy::out_height() * strategy::out_width();
424  size_t num_buffers = iceildiv(_Msize, strategy::out_height()) * iceildiv(_Nsize, strategy::out_width()) * _nbatches * _nmulti;
425 
426  return num_buffers * size_per_buffer;
427  }
428 
429  // Get pointer into accumulation buffer
430  Tab *get_accumulation_buffer(unsigned int M, unsigned int N, unsigned int batch, unsigned int multi) const {
431  // Don't do anything if there's no buffer.
432  if (_accumulation_buffer == nullptr) {
433  return nullptr;
434  }
435 
436  // Here we are indexing an appropriately sized pointer, so no sizeof() needed to convert to bytes.
437  size_t size_per_buffer = strategy::out_height() * strategy::out_width();
438 
439  size_t buffer_rows = iceildiv(_Msize, strategy::out_height());
440  size_t buffer_cols = iceildiv(_Nsize, strategy::out_width());
441  size_t buffers_per_batch = (buffer_rows * buffer_cols);
442  size_t buffers_per_multi = buffers_per_batch * _nbatches;
443 
444  // M/N must reference the top-left corner of a block.
445  size_t row = M / strategy::out_height();
446  assert(M % strategy::out_height() == 0);
447  size_t col = N / strategy::out_width();
448  assert(N % strategy::out_width() == 0);
449 
450  size_t buffer_index = multi * buffers_per_multi + batch * buffers_per_batch + row * buffer_cols + col;
451 
452  return _accumulation_buffer + (buffer_index * size_per_buffer);
453  }
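// Worked example of the indexing above (hypothetical sizes): out_height()==8, out_width()==12,
// _Msize==64, _Nsize==48, _nbatches==2 gives an 8x4 grid of blocks per batch (32 per batch,
// 64 per multi). The block at M==16, N==24 in batch 1 of multi 0 is
//   buffer_index = 0*64 + 1*32 + 2*4 + 2 = 42,
// i.e. an offset of 42 * (8*12) = 4032 elements into _accumulation_buffer.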
454 
455  int32_t row_sum_multiplier() const {
456  if (std::is_same<OutputStage, Requantize32>::value) {
457  const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&_os);
458 
459  return -qp->b_offset;
460  }
461 
462  return 0;
463  }
464 
465  // Heuristics to decide whether to use the 'thread columns' regime
466  static bool is_thread_columns(const GemmArgs &args) {
467  // For now, there is a template parameter to force it.
468  if (ForceThreadColumns) {
469  return true;
470  }
471 
472  // Never do this for single threaded cases.
473  if (args._maxthreads == 1) {
474  return false;
475  }
476 
477  // How many blocks of work are available for threading on M?
478  int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;
479 
480  // If we simply can't spread the work across all the threads with the row threading regime, use column threading.
481  if (args._maxthreads > m_blocks) {
482  return true;
483  }
484 
485  // If the row threading regime is too wasteful (20% threshold)
486  if (((roundup(m_blocks, args._maxthreads) * 100) / m_blocks) > 120) {
487  return true;
488  }
489 
490  return false;
491  }
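// Worked example of the 20% rule (hypothetical sizes): _Msize==100, out_height()==8,
// _nbatches==1 gives m_blocks==13. With 8 threads, roundup(13, 8)==16 and
// (16*100)/13 == 123 > 120, so the padded schedule is more than 20% larger than the
// real work and the column threading regime is chosen.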
492 
493  static unsigned int get_ktotal(const GemmArgs &args) {
494  return args._Ksections * roundup(args._Ksize, strategy::k_unroll());
495  }
496 
497  static unsigned int get_k_block_size(const GemmArgs &args) {
498  if (args._cfg && args._cfg->inner_block_size) {
499  return args._cfg->inner_block_size;
500  }
501 
502  // K blocking not supported if we are requantizing.
503  if (std::is_same<OutputStage, Requantize32>::value) {
504  return get_ktotal(args);
505  }
506 
507  const unsigned int L1_size = args._ci->get_L1_cache_size();
508  unsigned int k_block;
509 
510  // k_block: Find out how much of the larger array can be loaded into half the cache.
511  // This should account for associative caches.
512  k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
513 
514  // Needs to be (at least a single) multiple of the K unroll level.
515  k_block /= strategy::k_unroll();
516  k_block = std::max(k_block, 1U) * strategy::k_unroll();
517 
518  // Now tune to presented problem size; this is how many blocks we need.
519  unsigned int num_k_blocks = iceildiv(get_ktotal(args), k_block);
520 
521  // So divide the space equally into that many blocks.
522  k_block = iceildiv(get_ktotal(args), num_k_blocks);
523 
524  // And round UP to the K unroll level required.
525  k_block = roundup(k_block, strategy::k_unroll());
526 
527  assert(k_block > 0);
528 
529  return k_block;
530  }
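// Worked example (hypothetical numbers): L1==32KB, sizeof(Toi)==1,
// max(out_width(), out_height())==12, k_unroll()==8, Ktotal==2000:
//   16384 / 12           = 1365  -> rounded down to a multiple of 8 = 1360
//   iceildiv(2000, 1360) = 2     -> so split K into 2 roughly equal blocks
//   iceildiv(2000, 2)    = 1000  -> roundup(1000, 8) = 1000 = final k_block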
531 
532  static unsigned int get_x_block_size(const GemmArgs &args) {
533  if (is_thread_columns(args)) {
534  // In 2D mode, override X block, because we will process width first.
535  return roundup(args._Nsize, strategy::out_width());
536  }
537 
538  if (args._cfg && args._cfg->outer_block_size) {
539  return roundup(args._cfg->outer_block_size, strategy::out_width());
540  }
541 
542  unsigned int x_block;
543  const unsigned int L2_size = args._ci->get_L2_cache_size();
544  const unsigned int k_block = get_k_block_size(args);
545 
546  // x_block: Work out how many rows (of length k_block) will fit in the L2
547  // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
548  const unsigned int scaled_l2_size = (L2_size * 9) / 10;
549  const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());
550 
551  // .. if the L1 contents are bigger than the L2, just return a minimal size block.
552  if (k_block_area > scaled_l2_size) {
553  return strategy::out_width();
554  }
555 
556  x_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);
557 
558  // Needs to be (at least a single) multiple of the kernel output width.
559  x_block /= strategy::out_width();
560  x_block = std::max(x_block, 1u) * strategy::out_width();
561 
562  // And tune to the presented problem size.
563  unsigned int num_x_blocks = iceildiv(args._Nsize, x_block);
564  x_block = iceildiv(args._Nsize, num_x_blocks);
565 
566  x_block = roundup(x_block, strategy::out_width());
567 
568  assert(x_block > 0);
569 
570  return x_block;
571  }
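// Worked example (hypothetical numbers): L2==512KB, sizeof(Toi)==1, out_width()==12,
// out_height()==8, k_block==1000, Nsize==1024:
//   scaled_l2_size = 471859, k_block_area = 1000 * (12 + 8) = 20000
//   (471859 - 20000) / 1000 = 451  -> rounded down to a multiple of 12 = 444
//   iceildiv(1024, 444) = 3 blocks -> iceildiv(1024, 3) = 342 -> roundup(342, 12) = 348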
572 
573 public:
574  GemmInterleaved(GemmInterleaved &) = delete;
575  GemmInterleaved & operator= (GemmInterleaved &) = delete;
576 
577  /* Constructor */
578  GemmInterleaved(const GemmArgs &args, const OutputStage &os)
579  : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
580  _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
581  _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
582  _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
583  _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
584  _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
585  _os(os) { }
586 
587  /* Constructor without OutputStage */
588  GemmInterleaved(const GemmArgs &args)
589  : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
590  _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
591  _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
592  _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
593  _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
594  _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
595  _os() { }
596 
597  // Interface implementation - Compulsory functions
598 
599  // Window size: Only the last thread should do a ragged block, so dole
600  // out work in units of out_height. Factor batches into the window, but
601  // not multi for now (as this would cause problems with the buffer
602  // manager).
603  ndrange_t get_window_size() const override {
604  unsigned int row_blocks = (_Mround / strategy::out_height()) * _nbatches;
605 
606  if (_thread_columns) {
607  return { row_blocks, iceildiv(_Nsize, strategy::out_width()) };
608  } else {
609  // _Mround is a multiple of out_height by definition.
610  return { row_blocks };
611  }
612  }
613 
614  // set_nthreads: pass on to the buffer manager to avoid it waiting for non-existent threads.
615  void set_nthreads(int nthreads) override {
616  _nthreads = std::min(nthreads, _maxthreads);
617  }
618 
619  // Execute
620  void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
621 #ifdef CYCLE_PROFILING
622  profiler prof;
623 #endif
624 
625  /* Make sure we've been set up correctly. */
626  assert(_B_transposed);
627  assert(_working_space);
628  int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
629 
630  /* Align if needed */
631  intptr_t working_space_v = reinterpret_cast<intptr_t>(_working_space);
632  if (working_space_v & 0x3f) {
633  intptr_t alignment_offset = 0x40 - (working_space_v & 0x3f);
634  working_space_bytes += alignment_offset;
635  }
636 
637  strategy strat(_ci);
638 
639  const auto start = work_range.get_position(0);
640  const auto end = work_range.get_position_end(0);
641 
642  /* Translate 'start' and 'end' into a position within the batches and rows. */
643  const unsigned int window_per_batch = _Mround / strategy::out_height();
644  unsigned int batch_0 = start / window_per_batch;
645  unsigned int batch_end = end / window_per_batch;
646 
647  // In ThreadColumns mode, process work one horizontal strip at a time.
648  // Transpose the block of needed rows at the start, then do all the work on that block.
649  if (_thread_columns) {
650  const auto start_x = work_range.get_position(1) * strategy::out_width();
651  const auto end_x = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize);
652 
653  Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
654  Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) +
655  (threadid * sizeof(Toi) * get_total_k_depth() * strategy::out_height()));
656 
657  for (unsigned int multi=0; multi<_nmulti; multi++) {
658  for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
659  unsigned int kmax=std::min(k0+_k_block, _Ktotal);
660 
661  unsigned int rounded_width = roundup(_Nsize, strategy::out_width());
662 
663  const bool first_pass = (k0==0);
664  const bool last_pass = (kmax==_Ktotal);
665 
666  // Figure out how many "K" the kernel will actually process.
667  unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());
668 
669  const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);
670 
671  unsigned int batch = batch_0;
672  unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height();
673 
674  for (unsigned int p=start; p<end; p++) {
675  unsigned int end_row = std::min(start_row + strategy::out_height(), _Msize);
676 
677  // Set up transposed 'A' block
678  {
679 #ifdef CYCLE_PROFILING
680  auto p=prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) * sizeof(Toi));
681 #endif
682  // See comment above on transform_type<> class: this extracts either 'transforms' or
683  // 'transforms_quantized' as appropriate.
684  typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
685 
686  if (_indirect_buf != nullptr) {
687  transforms.PrepareA_indirect(a_panel,
688  _indirect_buf + (multi * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
689  _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
690  } else if (_convolver) {
691  transforms.PrepareA_convolution(a_panel,
692  this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
693  this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
694  } else {
695  transforms.PrepareA(a_panel,
696  this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
697  this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier());
698  }
699  }
700 
701  // Perform the kernel and merge step, either separately or together as required.
702  kernel_and_merge<MergeStep, OutputStage>::run(
703  #ifdef CYCLE_PROFILING
704  prof,
705  #endif
706  // Strategy and panel pointers
707  strat, a_panel, b_ptr, c_panel,
708  // Result buffer pointers
709  this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc,
710  // K size, and M/N ranges
711  kern_k, start_row, end_row, start_x, end_x,
712  // Only do bias on the first pass
713  ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
714  // Only do activation on the last pass, and accumulation on any non-first pass.
715  (last_pass ? _act : Activation()), !first_pass,
716  // Pass in quantization parameters for requantizing kernels (others will ignore)
717  _os, col_bias + (multi * _Nsize),
718  // Accumulation buffer (not yet implemented on this path)
719  static_cast<Tab *>(nullptr));
720 
721  /* Increment to the next block */
722  start_row += strategy::out_height();
723  if (start_row >= _Msize) {
724  start_row = 0;
725  batch++;
726  }
727  }
728  }
729  }
730  } else {
731  blockwalker current(*this);
732 
733  /* Compute the M values to operate on */
734  unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height();
735  unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();
736 
737  // Private buffers. Treat working_space as an array of C buffers
738  // (one per thread) first, followed by the (window-divided) A
739  // buffer.
740  // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
741  Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
742  Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
743 
744  const Toi *b_panel;
745  b_panel = _B_transposed;
746 
747  // newkblock() is always true on the first iteration, so these will be set properly on the first loop.
748 
749  // kern_k tracks the accumulation depth for the CURRENT K block; a_panel_stride similarly tracks the total
750  // stride of the A panel (i.e. with 4 bytes added for cases with embedded row sums).
751 
752  // These are distinct from k_block and get_total_k_depth() which are based on the target K block size, and
753  // used for addressing inside a_panel.
754 
755  // In cases where K blocking is in use and the blocks are not all the same size, the (smaller) final block
756  // won't use all the memory allocated.
757  unsigned int kern_k = 0;
758  unsigned int a_panel_stride = 0;
759 
760  for (;!current.done();current.advance()) {
761  if (current.newkblock()) {
762 #ifdef CYCLE_PROFILING
763  auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
764 #endif
765  // See comment above on transform_type<> class: this extracts either 'transforms' or
766  // 'transforms_quantized' as appropriate.
767  typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
768 
769  for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
770  unsigned int first_m = (batch == batch_0) ? m_0 : 0;
771  unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
772 
773  if (first_m >= last_m)
774  continue;
775 
776  if (_indirect_buf != nullptr) {
777  transforms.PrepareA_indirect(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
778  _indirect_buf + (current.multi() * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
779  _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
780  } else if (_convolver) {
781  transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
782  this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
783  this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
784  } else {
785  transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
786  this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
787  this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier());
788  }
789  }
790 
791  // Figure out how many "K" the kernel will actually process.
792  kern_k = roundup(current.kmax() - current.k0(), strategy::k_unroll());
793 
794  // Requantizing GEMMs have the row sums built in to the
795  // transposed data, so the stride between rows is 4 bytes
796  // larger than the (rounded) K value.
797 
798  if(std::is_same<OutputStage, Requantize32>::value) {
799  a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Toi));
800  } else {
801  a_panel_stride = kern_k;
802  }
803  }
804 
805  /* Do the actual work. */
806  for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
807  unsigned int first_m = (batch == batch_0) ? m_0 : 0;
808  unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
809 
810  const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth();
811 
812  if (first_m >= last_m)
813  continue;
814 
815  // For the merge case we need to do this out_height() rows
816  // at a time, as that is the size of our intermediate
817  // buffer. If we are not doing that, we can do all the
818  // relevant rows in one go.
819  unsigned int m_step = MergeStep ? strategy::out_height() : (last_m - first_m);
820 
821  // But in the case where we have an accumulation buffer, we can't do that after all, unless
822  // there is no N blocking.
823  if (_accumulation_buffer && ((current.x0() != 0) || (current.xmax() < _Nsize))) {
824  m_step = strategy::out_height();
825  }
826 
827  for (unsigned int y=first_m; y<last_m; y+=m_step) {
828  unsigned int ymax = std::min(_Msize, y + m_step);
829 
830  const bool first_pass = (current.k0() == 0);
831  const bool last_pass = (current.kmax() == _Ktotal);
832 
833  // Pointer to appropriate part of result array.
834  Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);
835 
836  // If we are using an accumulation buffer, pass a null result pointer to ask the kernel
837  // to write into the accumulation buffer instead, except on the last pass.
838  if (_accumulation_buffer && !last_pass) {
839  result_ptr = nullptr;
840  }
841 
842  // Perform the kernel and merge step, either separately or together as required.
843  kernel_and_merge<MergeStep, OutputStage>::run(
844  #ifdef CYCLE_PROFILING
845  prof,
846  #endif
847  // Strategy and panel pointers
848  strat, a_ptr, b_panel, c_panel,
849  // Result buffer pointers
850  result_ptr, this->_ldc,
851  // K size, and M/N ranges
852  kern_k, y, ymax, current.x0(), current.xmax(),
853  // Only do bias on the first pass
854  ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
855  // Only do activation on the last pass, and accumulation on any non-first pass.
856  (last_pass ? _act : Activation()), !first_pass,
857  // Pass in quantization parameters for requantizing kernels (others will ignore)
858  _os, col_bias + (current.multi() * _Nsize),
859  // Accumulation buffer
860  get_accumulation_buffer(y, current.x0(), batch, current.multi()) );
861 
862  a_ptr += (strategy::out_height() * a_panel_stride);
863  }
864  }
865 
866  b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
867  }
868  }
869  }
870 
871  // Interface implementation - working space
872  size_t get_working_size() const override {
873  // In all cases, we need one A buffer plus a C buffer per thread, plus an accumulation buffer.
874  size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads) + get_accumulation_buffer_size();
875 
876  size += 128; // Add on two cache lines extra for alignment.
877 
878  return size;
879  }
880 
881  void set_working_space(void *working_space) override {
882  // Make sure everything ends up cache line aligned
883  int8_t *working_space_bytes = reinterpret_cast<int8_t *>(working_space);
884  intptr_t working_space_int = reinterpret_cast<intptr_t>(working_space);
885 
886  size_t diff=0;
887 
888  if (working_space_int & 0x3F) {
889  diff = 0x40 - (working_space_int & 0x3F);
890  }
891 
892  working_space_bytes += diff;
893  working_space_int += diff;
894 
895  // Pretransposed case: just set internal pointer to parameter value.
896  _working_space = reinterpret_cast<void *>(working_space_bytes);
897 
898  // Set up accumulation buffer
899  if (get_accumulation_buffer_size() > 0) {
900  intptr_t acc_buff_int = working_space_int + get_a_working_size() + (get_c_working_size() * _maxthreads);
901  // Make sure the accumulation buffer is aligned (needed if the other blocks are not a multiple of cache line length)
902  if (acc_buff_int & 0x3F) {
903  acc_buff_int += (0x40 - (acc_buff_int & 0x3F));
904  }
905  _accumulation_buffer = reinterpret_cast<Tab *>(acc_buff_int);
906  } else {
907  _accumulation_buffer = nullptr;
908  }
909  }
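// Example of the alignment fix-up (hypothetical address): if the caller hands in
// working_space == 0x10000008, then 0x10000008 & 0x3F == 0x08, so diff == 0x38 and all
// internal pointers are laid out from the cache-line aligned address 0x10000040; the two
// spare cache lines added in get_working_size() cover this shift and the later
// realignment of the accumulation buffer.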
910 
911  // Interface implementation - pretransposed
912  bool B_is_pretransposed() const override {
913  return true;
914  }
915 
916  bool B_pretranspose_required() const override {
917  return (_B_transposed==nullptr);
918  }
919 
920  size_t get_B_pretransposed_array_size() const override {
921  unsigned int x_size = roundup(_Nsize, strategy::out_width());
922 
923  return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
924  }
925 
926  void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
927  if (std::is_same<OutputStage, Requantize32>::value) {
928  col_bias = reinterpret_cast<int32_t *>(in_buffer);
929 
930  Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);
931 
932  for (unsigned int i=0; i<_nmulti; i++) {
933  // The input is assumed not to have any padding between sections, so a straightforward Ksize * Ksections computation gives the total size.
934  compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
935  }
936  }
937 
938  // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
939  uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
940  Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
941  _B_transposed = buffer;
942 
943  blockwalker current(*this);
944  strategy strat(_ci);
945 
946  do {
947  /* Figure out the size of each block. */
948  unsigned int k_size = (current.kmax() - current.k0());
949 
950  // We need to insert padding at the end of each K section.
951  // The computation needed is a little delicate - the coordinates from the block walker are expressed in
952  // terms of the full, padded, _Ktotal.
953  // But we need to transform each section with reference to the original, unpadded, input, letting the
954  // transform pad each section as needed.
955 
956  // This is needed for computations below.
957  const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());
958 
959  // The expected output format is an entire set of <out_width> columns interleaved, then the next set of
960  // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column block
961  // at a time.
962  for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ){
963  unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());
964 
965  // Track where we are and how much work is left.
966  unsigned int kpos = current.k0();
967  unsigned int kleft = k_size;
968 
969  while (kleft) {
970  // Which section are we in? Based on the rounded-up section size.
971  unsigned int k_section_base = kpos / rounded_section_size;
972  // How far into the section are we?
973  unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
974 
975  // We will copy either the rest of this section, or up to the end of the requested length, whichever is shorter.
976  unsigned int k_length = std::min(_Ksize - k_offset, kleft);
977 
978  strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
979  x0, xmax,
980  (k_section_base * _Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
981  (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
982 
983  // We need to modify our position based on the ROUNDED version of what we just did.
984  unsigned int padded_length = roundup(k_length, strategy::k_unroll());
985 
986  buffer += strategy::out_width() * padded_length;
987 
988  kpos += padded_length;
989  kleft -= padded_length;
990  }
991  }
992  } while (current.advance());
993  }
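// Worked example of the section arithmetic above (hypothetical sizes): _Ksize==20,
// _Ksections==3, k_unroll()==8 gives rounded_section_size==24 and _Ktotal==72. For a block
// starting at kpos==24: k_section_base==1, k_offset==0, k_length==min(20-0, kleft)==20, so
// PrepareB reads source rows (1*20)+0 .. (1*20)+20 (i.e. rows 20..39 of B) and the walker
// then advances by padded_length==roundup(20, 8)==24 rows of packed output.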
994 
995  void set_pretransposed_B_data(void *in_buffer) override {
996  // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
997  uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
998  _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
999  col_bias = reinterpret_cast<int32_t *>(in_buffer);
1000  }
1001 
1002  void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
1003  if (std::is_same<OutputStage, Requantize32>::value) {
1004  Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);
1005 
1006  qp->bias = bias;
1007  qp->bias_multi_stride = bias_multi_stride;
1008  }
1009  }
1010 
1011  void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
1012  assert(string_len == _Ksize);
1013  _indirect_buf = ptr;
1014  }
1015 
1016  void set_convolution_parameters(ConvolutionParameters parms) override {
1017  assert(parms.input_channels == _Ksize);
1018  _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
1019  }
1020 
1021  // Estimate cycles for the given problem, using the provided performance parameters.
1022  static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
1023  unsigned int k_blocks = iceildiv(args._Ksize, get_k_block_size(args));
1024 
1025  uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());
1026  uint64_t prepare_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * roundup(args._Msize, strategy::out_height()) * roundup(args._Ksize, strategy::k_unroll()) * sizeof(Toi);
1027  uint64_t merge_bytes = static_cast<uint64_t>(args._nbatches) * args._nmulti * k_blocks * roundup(args._Msize, strategy::out_height()) * roundup(args._Nsize, strategy::out_width()) * sizeof(Tr);
1028 
1029  float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
1030  float prepare_cycles = static_cast<float>(prepare_bytes) / params.prepare_bytes_cycle;
1031  float merge_cycles = static_cast<float>(merge_bytes) / params.merge_bytes_cycle;
1032 
1033  float total_cycles = mac_cycles + prepare_cycles + merge_cycles;
1034 
1035  // We can't thread over multis or width, which makes this a poor
1036  // choice in many threaded cases. Penalize that here.
1037  float parallelism_available = static_cast<float>(iceildiv(args._Msize, strategy::out_height()) * args._nbatches) * 0.9f;
1038 
1039  if (parallelism_available < args._maxthreads) {
1040  total_cycles *= (static_cast<float>(args._maxthreads) / parallelism_available);
1041  }
1042 
1043  return static_cast<uint64_t>(total_cycles);
1044  }
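// Rough worked example (hypothetical performance numbers): a single-batch 256x256x256 fp32
// problem with kernel_macs_cycle==16, prepare_bytes_cycle==8, merge_bytes_cycle==8 and
// k_blocks==1 gives
//   mac_cycles     = 256*256*256 / 16 = 1048576
//   prepare_cycles = 256*256*4 / 8    = 32768
//   merge_cycles   = 256*256*4 / 8    = 32768
// for a total of roughly 1.11M cycles before any threading penalty is applied.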
1045 };
1046 
1047 // Aliases for the variations
1048 template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
1049 using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>;
1050 
1051 template<typename strategy, typename To, typename Tr>
1052 using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>;
1053 
1054 template<typename strategy, typename To, typename Tr>
1055 using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>;
1056 
1057 } // namespace arm_gemm