Main execute member function. It walks the scheduling window assigned to the calling thread, transposes the required blocks of A into working space, and runs the kernel/merge step against the pre-transposed B panels.
#ifdef CYCLE_PROFILING
    profiler prof;
#endif

    assert(_B_transposed);
    assert(_working_space);
    int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);

    // Align the working space to a 64-byte boundary if the allocation was not
    // already aligned.
    intptr_t working_space_v = reinterpret_cast<intptr_t>(_working_space);
    if (working_space_v & 0x3f) {
        intptr_t alignment_offset = 0x40 - (working_space_v & 0x3f);
        working_space_bytes += alignment_offset;
    }
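    // Worked example for the fix-up above (addresses are illustrative): if
    // _working_space ends in ...0x10 then (working_space_v & 0x3f) == 0x10,
    // alignment_offset == 0x40 - 0x10 == 0x30, and working_space_bytes moves
    // forward 0x30 bytes onto the next 64-byte (typically cache-line) boundary.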
    // Construct the strategy (kernel) for this CPU, then translate the scheduling
    // window into batch/row positions.
    strategy strat(_ci);

    const auto start = work_range.get_position(0);
    const auto end   = work_range.get_position_end(0);

    const unsigned int window_per_batch = _Mround / strategy::out_height();
    unsigned int batch_0   = start / window_per_batch;
    unsigned int batch_end = end / window_per_batch;
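    // Example of this window translation (numbers are illustrative): with
    // _Mround == 128 and out_height() == 8, window_per_batch == 16, so window
    // position 35 falls in batch 35 / 16 == 2, at row block 35 - 2*16 == 3,
    // i.e. rows 24..31 of that batch.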
    if (_thread_columns) {
        const auto start_x = work_range.get_position(1) * strategy::out_width();
        const auto end_x   = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize);

        Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
        Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) +
                                                      (threadid * sizeof(Toi) * get_total_k_depth() * strategy::out_height()));
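        // Working space layout implied by the two casts above: _maxthreads C panels
        // of get_c_working_size() bytes each come first, followed by one transposed-A
        // panel per thread of sizeof(Toi) * get_total_k_depth() * out_height() bytes.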
        for (unsigned int multi=0; multi<_nmulti; multi++) {
            for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                unsigned int kmax = std::min(k0 + _k_block, _Ktotal);

                unsigned int rounded_width = roundup(_Nsize, strategy::out_width());

                const bool first_pass = (k0 == 0);
                const bool last_pass  = (kmax == _Ktotal);

                unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());

                const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);
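                // b_ptr indexing above: each multi owns rounded_width * _Ktotal elements
                // of pre-transposed B; (k0 * rounded_width) skips the earlier K blocks, and
                // (start_x * kern_k) skips the column strips to the left of this thread's
                // strip within the current K block.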
                unsigned int batch = batch_0;
                unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height();

                for (unsigned int p=start; p<end; p++) {
                    unsigned int end_row = std::min(start_row + strategy::out_height(), _Msize);
                    // Set up the transposed 'A' block for this strip of rows.
                    {
#ifdef CYCLE_PROFILING
                        auto p=prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) * sizeof(Toi));
#endif
                        // Pick the transform implementation; the row-sum variant is only
                        // needed when the quantizing (Requantize32) merge is in use.
                        typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;

                        if (_indirect_buf != nullptr) {
                            transforms.PrepareA_indirect(a_panel,
                                _indirect_buf + (multi * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
                                _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
                        } else if (_convolver) {
                            transforms.PrepareA_convolution(a_panel,
                                this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
                                this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
                        } else {
                            transforms.PrepareA(a_panel,
                                this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
                                this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier());
                        }
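                        // The three branches above select how the A block is gathered:
                        // PrepareA_indirect reads rows through the indirect pointer table
                        // (indirect GEMM), PrepareA_convolution gathers input patches via
                        // the convolver (GEMM-based convolution), and plain PrepareA reads
                        // a dense row-major A with leading dimension _lda.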
                    }

                    // Perform the kernel and merge step, either separately or together as required.
                    kernel_and_merge<MergeStep, false, OutputStage>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        // Strategy and panel pointers
                        strat, a_panel, b_ptr, c_panel,
                        // Result buffer pointer and leading dimension
                        this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc,
                        // K size, and M/N ranges for this call
                        kern_k, start_row, end_row, start_x, end_x,
                        // Only add bias on the first K pass
                        ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
                        // Only apply the activation on the last pass; accumulate on any non-first pass
                        (last_pass ? _act : Activation()), !first_pass,
                        // Quantization parameters (ignored by non-requantizing kernels)
                        _os, col_bias + (multi * _Nsize),
                        // No accumulation buffer on this path
                        static_cast<Tab *>(nullptr));
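                    // Pass handling above: bias is only added when k0 == 0 (first K pass),
                    // the activation is only applied when kmax == _Ktotal (last K pass), and
                    // every non-first pass accumulates onto the partial results already in C.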
                    // Advance to the next strip of rows; when this batch is exhausted,
                    // move on to the next one.
                    start_row += strategy::out_height();
                    if (start_row >= _Msize) {
                        start_row = 0;
                        batch++;
                    }
                }
            }
        }
    } else {
        blockwalker current(*this);

        // Translate 'start' and 'end' into row positions within the first and last batch.
        unsigned int m_0   = (start - (batch_0 * window_per_batch)) * strategy::out_height();
        unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();
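        // m_0 is the first row this thread handles inside batch_0, and m_max is the
        // row limit inside batch_end; batches in between are processed in full (see
        // the first_m / last_m selection in the loops below).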
        Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
        Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));

        const Toi *b_panel = _B_transposed;
        // kern_k and a_panel_stride describe the K extent and A-panel row stride of the
        // current K block; both are refreshed whenever newkblock() reports a new block.
        unsigned int kern_k = 0;
        unsigned int a_panel_stride = 0;

        for (;!current.done();current.advance()) {
            if (current.newkblock()) {
#ifdef CYCLE_PROFILING
                auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
#endif
                // Pick the transform implementation; the row-sum variant is only needed
                // when the quantizing (Requantize32) merge is in use.
                typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;

                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                    unsigned int first_m = (batch == batch_0) ? m_0 : 0;
                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                    if (first_m >= last_m)
                        continue;
                    if (_indirect_buf != nullptr) {
                        transforms.PrepareA_indirect(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                            _indirect_buf + (current.multi() * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
                            _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
                    } else if (_convolver) {
                        transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                            this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                            this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
                    } else {
                        transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
                            this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
                            this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier());
                    }
                }
                kern_k = roundup(current.kmax() - current.k0(), strategy::k_unroll());
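                // kern_k is the K extent rounded up to the kernel's k_unroll() granularity;
                // the pre-transposed B blocks are assumed to have been built with the same
                // rounding, so the kernel always reads whole unrolled K steps.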
                // Requantizing GEMMs keep a per-row int32 sum alongside each transposed
                // A row, so the stride between rows is larger than kern_k.
                if (std::is_same<OutputStage, Requantize32>::value) {
                    a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Toi));
                } else {
                    a_panel_stride = kern_k;
                }
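                // Illustrative numbers: with an int8 Toi and kern_k == 64 this gives
                // a_panel_stride == 64 + 4 == 68 elements; without requantization it is
                // just kern_k.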
            }

            // Do the actual work.
            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
                unsigned int first_m = (batch == batch_0) ? m_0 : 0;
                unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;

                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth();

                if (first_m >= last_m)
                    continue;

                unsigned int m_step = MergeStep ? strategy::out_height() : (last_m - first_m);

                if (_accumulation_buffer && ((current.x0() != 0) || (current.xmax() < _Nsize))) {
                    m_step = strategy::out_height();
                }
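                // m_step selection above: with a merge step the output is produced one
                // out_height() strip at a time; without one the whole first_m..last_m range
                // can go through the kernel in a single call, unless an accumulation buffer
                // combined with N blocking forces strip-at-a-time processing again.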
                for (unsigned int y=first_m; y<last_m; y+=m_step) {
                    unsigned int ymax = std::min(_Msize, y + m_step);

                    const bool first_pass = (current.k0() == 0);
                    const bool last_pass  = (current.kmax() == _Ktotal);

                    // Pointer to the relevant part of the result array.
                    Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);

                    // With an accumulation buffer, only the last pass writes to the real
                    // output; earlier passes pass a null result pointer so the kernel
                    // writes into the accumulation buffer instead.
                    if (_accumulation_buffer && !last_pass) {
                        result_ptr = nullptr;
                    }
                    // Perform the kernel and merge step, either separately or together as required.
                    kernel_and_merge<MergeStep, FixedFormat, OutputStage>::run(
#ifdef CYCLE_PROFILING
                        prof,
#endif
                        // Strategy and panel pointers
                        strat, a_ptr, b_panel, c_panel,
                        // Result buffer pointer and leading dimension
                        result_ptr, this->_ldc,
                        // K size, and M/N ranges for this call
                        kern_k, y, ymax, current.x0(), current.xmax(),
                        // Only add bias on the first K pass
                        ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
                        // Only apply the activation on the last pass; accumulate on any non-first pass
                        (last_pass ? _act : Activation()), !first_pass,
                        // Quantization parameters (ignored by non-requantizing kernels)
                        _os, col_bias + (current.multi() * _Nsize),
                        // Accumulation buffer for this block (if any)
                        get_accumulation_buffer(y, current.x0(), batch, current.multi()));

                    a_ptr += (strategy::out_height() * a_panel_stride);
                }
            }

            // Advance the B panel pointer past this block of pre-transposed B.
            b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
        }
    }
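For reference, the roundup() helper used in the loop bounds above rounds its first argument up to the next multiple of the second. A minimal sketch of that behaviour (the real utility lives in the arm_gemm utility headers; this version is only illustrative):

template <typename T>
T roundup(const T a, const T b) {
    T rem = a % b;
    return rem ? (a + b - rem) : a;   // e.g. roundup(100u, 16u) == 112u, roundup(96u, 16u) == 96u
}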