#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif
41 template<
typename OutputStage>
42 class run_gemv_kernel {
44 template<
typename strategy,
typename Tlo,
typename Tro,
typename Tr>
47 const Tlo *A_ptr,
const Tro *B_ptr, Tr *c_ptr,
50 const OutputStage &os,
const int32_t *col_bias,
unsigned int col_base
55 template<
typename strategy,
typename Tlo,
typename Tro,
typename Tr>
58 const Tlo *A_ptr,
const Tro *B_ptr, Tr *C_ptr,
61 const Nothing &,
const int32_t *,
unsigned int
64 strat.kernel(A_ptr, B_ptr, C_ptr,
N,
K,
bias, act, Accumulate);
68 template<
typename strategy,
typename Tlo,
typename Tro,
typename Tr>
71 const Tlo *A_ptr,
const Tro *B_ptr, Tr *C_ptr,
74 const Requantize32 &qp,
const int32_t *col_bias,
unsigned int col_base
77 strat.kernel(A_ptr, B_ptr, C_ptr,
N,
K, &qp, col_bias + col_base, col_base);
87 template<
typename strategy,
typename To,
typename Tr,
typename OutputStage=Nothing>
89 typedef typename strategy::operand_type Toi;
90 typedef typename strategy::result_type Tri;
94 const unsigned int _buffer_per_multi;
96 unsigned int k_block=0;
97 unsigned int n_block=0;
99 const Toi *_B_pretransposed =
nullptr;
104 int32_t *col_bias =
nullptr;
107 unsigned int get_col_sum_size()
const {
108 if(std::is_same<OutputStage, Requantize32>::value) {
121 _buffer_per_multi(
roundup(
args._Ksize, strategy::k_unroll()) *
roundup(
args._Nsize, strategy::out_width())),
124 if (strategy::supports_accumulate() &&
args._cfg &&
args._cfg->inner_block_size) {
125 k_block =
args._cfg->inner_block_size;
127 k_block =
args._Ksize;
130 if (
args._cfg &&
args._cfg->outer_block_size) {
131 n_block =
args._cfg->outer_block_size;
133 n_block =
args._Nsize;
144 #ifdef CYCLE_PROFILING
153 const unsigned int window_per_multi =
iceildiv(_args.
_Nsize, strategy::out_width());
154 const unsigned int multi_0 = start / window_per_multi;
155 const unsigned int multi_end =
end / window_per_multi;
158 const unsigned int n_0 = (start - (multi_0 * window_per_multi)) * strategy::out_width();
159 const unsigned int n_max = (
end - (multi_end * window_per_multi)) * strategy::out_width();
161 static_assert(std::is_same<Tr, Tri>::value,
"GemvPretransposed: Result types must be the same.");
163 for (
unsigned int multi=multi_0; multi<=multi_end; multi++) {
164 const unsigned int n_start = (multi==multi_0) ? n_0 : 0;
165 const unsigned int n_end = (multi==multi_end) ? n_max : _args.
_Nsize;
167 if (n_end <= n_start)
170 for (
unsigned int k0=0; k0<_args.
_Ksize; k0+=k_block) {
171 unsigned int kmax = std::min(k0 + k_block, _args.
_Ksize);
173 for (
unsigned int n=n_start; n<n_end; n+=n_block) {
174 unsigned int nmax = std::min(n + n_block, n_end);
175 #ifdef CYCLE_PROFILING
176 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (kmax-k0) * (nmax-n));
179 _B_pretransposed + (multi * _buffer_per_multi) + (n *
roundup(_args.
_Ksize, strategy::k_unroll())) + (k0 * strategy::out_width()),
180 this->_Cptr + (multi * this->_C_multi_stride) + n,
181 (nmax - n), (kmax-k0),
182 this->_bias ? this->_bias + (multi * this->_bias_multi_stride) + n :
nullptr,
183 _args.
_act, (k0 != 0),
184 _os, col_bias, n + (_args.
_Nsize * multi));
197 return (_B_pretransposed ==
nullptr);
201 return _buffer_per_multi * _args.
_nmulti *
sizeof(To) + get_col_sum_size();
204 void requantize_bias(
void *in_buffer,
const To *B,
const int ldb,
const int B_multi_stride)
override {
207 if (std::is_same<OutputStage, Requantize32>::value) {
208 col_bias =
reinterpret_cast<int32_t *
>(in_buffer);
212 for (
unsigned int i=0; i<_args.
_nmulti; i++) {
222 uintptr_t buffer_int =
reinterpret_cast<uintptr_t
>(buffer);
223 Toi *B_buffer =
reinterpret_cast<Toi *
>(buffer_int + get_col_sum_size());
227 for (
unsigned int multi=0; multi<_args.
_nmulti; multi++) {
228 strat.transforms.PrepareB(B_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _args.
_Nsize, 0, _args.
_Ksize);
231 _B_pretransposed = B_buffer;
235 _B_pretransposed =
reinterpret_cast<Toi *
>(buffer);
// Tail of get_config(): records the selected kernel strategy's name so
// callers can identify which implementation was chosen.
// NOTE(review): this fragment is cut mid-definition — the method head, the
// other config fields, and the rest of the class are not visible here.
244 c.
filter = get_type_name<strategy>();