86 #ifdef CYCLE_PROFILING 89 strategy strat(_args._ci);
91 const auto start = work_range.get_position(0);
92 const auto end = work_range.get_position_end(0);
95 const unsigned int window_per_multi =
iceildiv(_args._Nsize, strategy::out_width());
96 const unsigned int multi_0 =
start / window_per_multi;
97 const unsigned int multi_end =
end / window_per_multi;
100 const unsigned int n_0 = (
start - (multi_0 * window_per_multi)) * strategy::out_width();
101 const unsigned int n_max = (
end - (multi_end * window_per_multi)) * strategy::out_width();
103 static_assert(std::is_same<Tr, Tri>::value,
"GemvPretransposed: Result types must be the same.");
105 for (
unsigned int multi=multi_0; multi<=multi_end; multi++) {
106 const unsigned int n_start = (multi==multi_0) ? n_0 : 0;
107 const unsigned int n_end = (multi==multi_end) ? n_max : _args._Nsize;
109 if (n_end <= n_start)
112 for (
unsigned int k0=0; k0<_args._Ksize; k0+=k_block) {
113 unsigned int kmax = std::min(k0 + k_block, _args._Ksize);
115 for (
unsigned int n=n_start; n<n_end; n+=n_block) {
116 unsigned int nmax = std::min(n + n_block, n_end);
117 #ifdef CYCLE_PROFILING 118 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (kmax-k0) * (nmax-n));
120 strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + k0,
121 _B_pretransposed + (multi * _buffer_per_multi) + (n *
roundup(_args._Ksize, strategy::k_unroll())) + (k0 * strategy::out_width()),
122 this->_Cptr + (multi * this->_C_multi_stride) + n,
123 (nmax - n), (kmax-k0),
124 this->_bias ? this->_bias + (multi * this->_bias_multi_stride) + n :
nullptr,
125 _args._act, (k0 != 0));
T roundup(const T a, const T b)
T iceildiv(const T a, const T b)
void end(TokenStream &in, bool &valid)