template <typename TypeInput, typename TypeOutput>
    std::vector<IScheduler::Workload> workloads(num_threads);
    for(unsigned int t = 0; t < num_threads; ++t)
        workloads[t] = [=](const ThreadInfo &info)
            const unsigned int start = (info.thread_id * wsize) / num_threads;
            const unsigned int end   = ((info.thread_id + 1) * wsize) / num_threads;
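
// Deleter type used below for the malloc-allocated indirect buffers; presumably it simply calls free() on the pointer.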
void operator()(void *x)
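
// extract_parameters(): derive the GEMM dimensions (M, N, K) and the batch/multi/section counts from the a, b and d tensor shapes and the AsmGemmInfo.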
Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
    p.M = d->tensor_shape().y();
    p.K = a->tensor_shape().x();
    p.N = d->tensor_shape().x();

    p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];

    p.multis  = b->tensor_shape().z();
    p.batches = d->tensor_shape().total_size_upper(2) / p.multis;

    if(info.depth_output_gemm3d != 0)
        p.M       = d->tensor_shape().y() * d->tensor_shape().z();
        p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
    const int granule_threshold = 200;
    IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);

    return scheduling_hint;
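
// Fallback: templated IFallback implementation that owns the arm_gemm kernel, its workspace and the optional B-pretranspose and indirect-convolution buffers.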
template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
class Fallback : public CpuGemmAssemblyDispatch::IFallback
    ~Fallback() = default;

    void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
                   arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
                   const OutputStage &os = {});
    std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
                                                                                            const std::vector<int32_t> &multipliers);

    void run(ITensorPack &tensors) override;
    void prepare(ITensorPack &tensors) override;
    bool is_configured() const override;

    bool isVarWeightsKernel() const override
        if(!_gemm_kernel_asm)

    AsmGemmWorkspace = 0,
    void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
    void prepare_indirect_buffer(ITensorPack &tensors);

    std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
    std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
    TensorInfo _workspace_info{};
    TensorInfo _pretranspose_info{};
    bool _is_prepared{ false };
    AsmGemmInfo _gemm_info{};
    std::vector<int32_t> _shifts{};
    std::vector<int32_t> right_shifts{};
    std::vector<int32_t> left_shifts{};
    std::vector<int32_t> _multipliers{};
    std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
    std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
    std::vector<TypeInput> _indirect_pad{};
    bool _B_pretranspose_required{ false };
    bool _is_b_constant{ true };
    bool _is_c_constant{ true };
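
// set_requantize_data(): split the signed per-channel shifts into separate left-shift and right-shift arrays (as arm_gemm::Requantize32 expects) and report whether any left shift is actually needed.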
template <typename TypeInput, typename TypeOutput, class OutputStage>
std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
    _multipliers = multipliers;

    bool need_left = false;
    for(const auto s : _shifts)
        left_shifts.push_back(std::max(-s, int32_t(0)));
        right_shifts.push_back(std::min(-s, int32_t(0)));
        if(s < 0 && !need_left)

    return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
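
// prepare_indirect_buffer(): fill the indirect pointer table used for convolution; each (kernel position, output position) entry either points into the input tensor or at the zero-padding vector.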
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)
    const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer());

    const int batches           = a->info()->tensor_shape().total_size_upper(3);
    const size_t stride_A       = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
    const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
    const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput);

    const size_t batch_stride = batch_size / sizeof(TypeInput);
    const int multi_size      = batch_size * batches;
    const size_t multi_stride = multi_size / sizeof(TypeInput);

    for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
        for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
            int64_t output_xy = (output_y * _cp.output_width) + output_x;

            for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
                for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
                    int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
                    int64_t input_xy  = (input_y * _cp.input_width) + input_x;

                    _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();

                    _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
                        A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
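
// configure_indirect(): derive convolution geometry from the NHWC tensor shapes, pass it to the assembly kernel, allocate the indirect pointer tables and register them via set_indirect_parameters().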
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
    zeropad = a->quantization_info().uniform().offset;

    const int64_t input_width    = static_cast<int64_t>(a->tensor_shape()[1]);
    const int64_t input_height   = static_cast<int64_t>(a->tensor_shape()[2]);
    const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
    const int64_t kernel_width   = static_cast<int64_t>(b->tensor_shape()[2]);
    const int64_t kernel_height  = static_cast<int64_t>(b->tensor_shape()[3]);
    const int64_t output_width   = static_cast<int64_t>(d->tensor_shape()[1]);
    const int64_t output_height  = static_cast<int64_t>(d->tensor_shape()[2]);

            info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad

    _gemm_kernel_asm->set_convolution_parameters(_cp);

    const unsigned int multis  = 1;
    const unsigned int batches = a->tensor_shape().total_size_upper(3);

    using TypeInputPtr = TypeInput *;
    const int batch_size      = kernel_hw * output_hw * sizeof(TypeInputPtr);
    const size_t batch_stride = batch_size / sizeof(TypeInputPtr);
    const int multi_size      = batch_size * batches;
    const size_t multi_stride = multi_size / sizeof(TypeInputPtr);

    _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
    _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
    _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));

    for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
        (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;

    _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
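
// Fallback::configure(): instantiate the arm_gemm kernel, wrap it in a CpuGemmAssemblyWrapperKernel, size the temporary working space and, when the kernel requires it, the persistent B-pretranspose buffer.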
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
                                                             arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
                                                             const OutputStage &os)
    _is_b_constant = b->are_values_constant();
    _is_c_constant = c ? c->are_values_constant() : true;

    _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
    if(_gemm_kernel_asm == nullptr)

    auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
    acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
    const size_t workspace_size  = _gemm_kernel_asm->get_working_size();
    const unsigned int alignment = 4096;
    _workspace_info = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
    _aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);

    const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
    if(window_size < static_cast<unsigned int>(args._maxthreads))
        _gemm_kernel_asm->set_nthreads(window_size);

    _optimised_kernel = std::move(acl_gemm_wrapper);

    if(_gemm_kernel_asm->B_pretranspose_required())
        const unsigned int alignment     = 128;
        const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
        _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
        _aux_mem[Pretranspose] = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
        _B_pretranspose_required = true;
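
// Fallback::prepare(): one-off preparation; push the quantized bias to the kernel, pretranspose B in parallel into the persistent auxiliary tensor, and build the indirect buffer when running a convolution.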
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
    _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);

    if(_gemm_kernel_asm->B_pretranspose_required())
        const int ldb            = b->info()->strides_in_bytes().y() / b->info()->element_size();
        const auto in1_ptr       = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
        const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();

        CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
        run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());

    prepare_indirect_buffer(tensors);
template <typename TypeInput, typename TypeOutput, class OutputStage>
bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
    return _optimised_kernel != nullptr;
template <typename TypeInput, typename TypeOutput, class OutputStage>

template <typename TypeInput, typename TypeOutput, class OutputStage>
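
// Fallback::run(): compute leading dimensions and batch/multi strides from the tensor strides, refresh non-constant B and bias data, clamp the thread count to the kernel window, and hand all pointers to the assembly kernel via set_arrays().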
    int lda       = a->info()->strides_in_bytes().y() / a->info()->element_size();
    const int ldd = d->info()->strides_in_bytes().y() / d->info()->element_size();

    const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
    const size_t a_multi_idx = a_batch_idx + 1;
    const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
    const size_t d_multi_idx = d_batch_idx + 1;

    int batch_stride_a       = a->info()->strides_in_bytes()[a_batch_idx] / a->info()->element_size();
    const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / d->info()->element_size();

    int multi_stride_a       = a->info()->strides_in_bytes()[a_multi_idx] / a->info()->element_size();
    int multi_stride_b       = 0;
    const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size();

    auto in0_ptr             = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
    const TypeInput *in1_ptr = nullptr;
    auto out_ptr             = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());

    if(!_gemm_kernel_asm->B_is_pretransposed())
        ldb            = b->info()->strides_in_bytes().y() / b->info()->element_size();
        multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
        in1_ptr        = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());

    if((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
        _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);

        if(_B_pretranspose_required)
            const int ldb            = b->info()->strides_in_bytes().y() / b->info()->element_size();
            const auto b_ptr         = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
            const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();

            CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);
            _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
            run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());

    const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());

    CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
    if(workspace.get()->buffer() != nullptr)
        _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
        const unsigned int split_dim   = scheduling_hint.split_dimension();
        const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
        if(window_size < num_threads)
            num_threads = window_size;
        const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
        num_threads = std::min(num_iterations, num_threads);
        _gemm_kernel_asm->set_nthreads(num_threads);

    TypeOutput *bias = nullptr;
        bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());

    _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
                                 in1_ptr, ldb, multi_stride_b,
                                 out_ptr, ldd, batch_stride_d, multi_stride_d,
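
// create_arm_gemm(): build arm_gemm::GemmArgs from the extracted Params and configure a plain Fallback<TypeInput, TypeOutput>.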
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
                     const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,

    Params p = extract_parameters(a, b, d, info);

    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
                            info.fixed_format, info.fast_mode, &cfg);

    auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
    fallback->configure(a, b, c, d, args, info);
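
// create_arm_gemm_quant(): as above, but with an arm_gemm::Requantize32 output stage; the elided constructor call takes the negated offsets plus either per-channel or per-tensor multipliers and shifts.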
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
                           const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,

    Params p = extract_parameters(a, b, d, info);

    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
                            info.fixed_format, info.fast_mode, &cfg);

    auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();

    const int32_t negation = info.negated_offsets ? 1 : -1;
    const int32_t a_offset = -a->quantization_info().uniform().offset * negation;
    const int32_t b_offset = -b->quantization_info().uniform().offset * negation;

    const GEMMLowpOutputStageInfo os_info = info.output_stage;

    if(os_info.gemmlowp_shifts.size() > 1)
        const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
            a_offset, b_offset, os_info.gemmlowp_offset,
            (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
            std::get<2>(requantize_data),
            std::get<3>(requantize_data),
            os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);

            a_offset, b_offset, os_info.gemmlowp_offset,
            -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
            os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);

    fallback->configure(a, b, c, d, args, info, gemm_requant_info);
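
// Dispatcher-level fragments: extract Params, build GemmArgs, and report the data-type combinations for which no optimized assembly kernel is available.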
    Params p = extract_parameters(a, b, d, info);

    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads,
                            info.fixed_format, info.fast_mode, &cfg);

    "We could not find an optimized kernel for F32 input");
    "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
    "We could not find an optimized kernel for U8 input and U8 output");
    "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
    "We could not find an optimized kernel for S8 input and S8 output");
#if defined(ARM_COMPUTE_ENABLE_BF16)
    "We could not find an optimized kernel for BFLOAT16 input and F32 output");
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    "We could not find an optimized kernel for F16 input and F16 output");

    "Only QASYMM8/S32 output supported for QASYMM8 input");
    "The format expected by the kernel does not correspond with the one requested by the user.");
    create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
    create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
    create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info);
    create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
    create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);
#if defined(ARM_COMPUTE_ENABLE_BF16)
    create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info);
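
// Thin forwarding methods: prepare(), is_configured(), run() and workspace() delegate to the selected _arm_gemm fallback.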
    _arm_gemm->prepare(tensors);

    return _arm_gemm && _arm_gemm->is_configured();

    _arm_gemm->run(tensors);

    return _arm_gemm->workspace();