void operator()(void *x)
Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)

p.M = d->tensor_shape().y();
p.K = a->tensor_shape().x();
p.N = d->tensor_shape().x();

p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];

p.multis  = b->tensor_shape().z();
p.batches = d->tensor_shape().total_size_upper(2) / p.multis;

if(info.depth_output_gemm3d != 0)
{
    p.M       = d->tensor_shape().y() * d->tensor_shape().z();
    p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
}
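// Hedged aside (standalone sketch, hypothetical helper, not part of the listing):
// for the plain 2D case the extraction above reads the GEMM dimensions straight
// from the x-fastest tensor shapes, i.e. K is the width of A while N and M are
// the width and height of the destination D.
#include <cstddef>

struct GemmDims
{
    std::size_t M, N, K;
};

inline GemmDims gemm_dims_2d(std::size_t a_shape_x, std::size_t d_shape_x, std::size_t d_shape_y)
{
    // a_shape_x = K, d_shape_x = N, d_shape_y = M
    return GemmDims{ d_shape_y, d_shape_x, a_shape_x };
}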
const int granule_threshold = 200;
IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);

return scheduling_hint;
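// Hedged aside (standalone sketch, hypothetical helper): the idea behind a granule
// threshold like the one above - only split the window across threads when each
// thread would receive at least granule_threshold steps of work.
inline bool split_is_worthwhile(unsigned int window_size, unsigned int num_threads, unsigned int granule_threshold)
{
    return num_threads > 0 && (window_size / num_threads) >= granule_threshold;
}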
template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
class Fallback : public CpuGemmAssemblyDispatch::IFallback

~Fallback() = default;
void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
               const OutputStage &os = {});

std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
                                                                                        const std::vector<int32_t> &multipliers);

void run(ITensorPack &tensors) override;
void prepare(ITensorPack &tensors) override;
bool is_configured() const override;
AsmGemmWorkspace = 0,
void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
void prepare_indirect_buffer(ITensorPack &tensors);
std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
std::unique_ptr<INEKernel>                                   _optimised_kernel{ nullptr };
TensorInfo                                                   _workspace_info{};
TensorInfo                                                   _pretranspose_info{};
bool                                                         _is_prepared{ false };
AsmGemmInfo                                                  _gemm_info{};
std::vector<int32_t>                                         _shifts{};
std::vector<int32_t>                                         right_shifts{};
std::vector<int32_t>                                         left_shifts{};
std::vector<int32_t>                                         _multipliers{};
std::unique_ptr<const TypeInput *const *, free_delete>       _indirect_arg{};
std::unique_ptr<const TypeInput *, free_delete>              _indirect_buf{};
std::vector<TypeInput>                                       _indirect_pad{};
bool                                                         _B_pretranspose_required{ false };
bool                                                         _is_b_constant{ true };
bool                                                         _is_c_constant{ true };
template <typename TypeInput, typename TypeOutput, class OutputStage>
std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)

_multipliers = multipliers;
_shifts      = shifts;

bool need_left = false;
for(const auto s : _shifts)
{
    left_shifts.push_back(std::max(-s, int32_t(0)));
    right_shifts.push_back(std::min(-s, int32_t(0)));
    if(s < 0 && !need_left)
    {
        need_left = true;
    }
}

return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
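// Hedged aside (standalone sketch, hypothetical names): the shift-splitting rule
// used above - a negative per-channel shift s becomes a left shift of -s, anything
// else becomes a (non-positive) right-shift entry, and need_left records whether
// any left shift is required at all.
#include <algorithm>
#include <cstdint>
#include <vector>

struct SplitShifts
{
    std::vector<int32_t> left;
    std::vector<int32_t> right;
    bool                 need_left = false;
};

inline SplitShifts split_requantize_shifts(const std::vector<int32_t> &shifts)
{
    SplitShifts out;
    for(const int32_t s : shifts)
    {
        out.left.push_back(std::max(-s, int32_t(0)));
        out.right.push_back(std::min(-s, int32_t(0)));
        out.need_left = out.need_left || (s < 0);
    }
    return out;
}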
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)

const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer());

const int    batches        = a->info()->tensor_shape().total_size_upper(3);
const size_t stride_A       = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput);

const size_t batch_stride = batch_size / sizeof(TypeInput);
const int    multi_size   = batch_size * batches;
const size_t multi_stride = multi_size / sizeof(TypeInput);

for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
    for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
        int64_t output_xy = (output_y * _cp.output_width) + output_x;

        for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
            for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
                int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
                int64_t input_xy  = (input_y * _cp.input_width) + input_x;

                // The bounds check selecting between the two assignments below is elided in
                // this excerpt: out-of-range taps point at the zero-padding vector.
                _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();

                _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
                    A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
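// Hedged aside (standalone sketch, hypothetical helper): the flat index used by the
// indirection buffer above - one entry per (multi, batch, kernel position, output
// position), with each entry pointing either into the input tensor or at the
// zero-padding vector.
#include <cstddef>

inline std::size_t indirect_index(std::size_t m, std::size_t b, std::size_t kernel_xy, std::size_t output_xy,
                                  std::size_t multi_stride, std::size_t batch_stride, std::size_t output_hw)
{
    return m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy;
}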
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)

zeropad = a->quantization_info().uniform().offset;

const int64_t input_width    = static_cast<int64_t>(a->tensor_shape()[1]);
const int64_t input_height   = static_cast<int64_t>(a->tensor_shape()[2]);
const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
const int64_t kernel_width   = static_cast<int64_t>(b->tensor_shape()[2]);
const int64_t kernel_height  = static_cast<int64_t>(b->tensor_shape()[3]);
const int64_t output_width   = static_cast<int64_t>(d->tensor_shape()[1]);
const int64_t output_height  = static_cast<int64_t>(d->tensor_shape()[2]);

        info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad

_gemm_kernel_asm->set_convolution_parameters(_cp);

const unsigned int multis  = 1;
const unsigned int batches = a->tensor_shape().total_size_upper(3);

using TypeInputPtr = TypeInput *;
const int    batch_size   = kernel_hw * output_hw * sizeof(TypeInputPtr);
const size_t batch_stride = batch_size / sizeof(TypeInputPtr);
const int    multi_size   = batch_size * batches;
const size_t multi_stride = multi_size / sizeof(TypeInputPtr);

_indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
_indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
_indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));

for(int64_t b = 0; b < batches; b++)
    for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
        (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;

_gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
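// Hedged aside (standalone sketch, hypothetical helpers): the two allocation sizes
// used above - the indirection buffer holds one input pointer per (multi, batch,
// kernel position, output position), while the argument table holds one row pointer
// per (multi, batch, kernel position).
#include <cstddef>

inline std::size_t indirect_buf_bytes(std::size_t kernel_hw, std::size_t output_hw,
                                      std::size_t batches, std::size_t multis)
{
    return kernel_hw * output_hw * sizeof(void *) * batches * multis;
}

inline std::size_t indirect_arg_bytes(std::size_t kernel_hw, std::size_t batches, std::size_t multis)
{
    return sizeof(void *) * kernel_hw * batches * multis;
}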
template <typename TypeInput, typename TypeOutput, class OutputStage>
// (start of the Fallback<TypeInput, TypeOutput, OutputStage>::configure(...) parameter list elided in this excerpt)
                                                                    const OutputStage &os)

_is_b_constant = b->are_values_constant();
_is_c_constant = c ? c->are_values_constant() : true;

_gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
if(_gemm_kernel_asm == nullptr)

auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);

const size_t       workspace_size = _gemm_kernel_asm->get_working_size();
const unsigned int alignment      = 4096;
_workspace_info            = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
_aux_mem[AsmGemmWorkspace] = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);

const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
if(window_size < static_cast<unsigned int>(args._maxthreads))
{
    _gemm_kernel_asm->set_nthreads(window_size);
}

_optimised_kernel = std::move(acl_gemm_wrapper);

if(_gemm_kernel_asm->B_pretranspose_required())
{
    const unsigned int alignment           = 128;
    const size_t       B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
    _pretranspose_info       = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
    _aux_mem[Pretranspose]   = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
    _B_pretranspose_required = true;
}

configure_indirect(a, b, d, gemm_info);
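// Hedged aside (standalone sketch, hypothetical helper): both auxiliary buffers above
// request explicit alignments (4096 bytes for the workspace, 128 bytes for the
// pretransposed B); a memory manager honouring such a request typically rounds the
// allocation size up as follows.
#include <cstddef>

inline std::size_t align_up(std::size_t size, std::size_t alignment)
{
    return ((size + alignment - 1) / alignment) * alignment;
}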
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)

_gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);

if(_gemm_kernel_asm->B_pretranspose_required())
{
    const int  ldb            = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
    const auto in1_ptr        = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
    const int  multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);

    CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
    _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);
}

prepare_indirect_buffer(tensors);
template <typename TypeInput, typename TypeOutput, class OutputStage>
bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
{
    return _optimised_kernel != nullptr;
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
// (definition elided in this excerpt)

template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
int       lda = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
const int ldd = d->info()->strides_in_bytes().y() / sizeof(TypeOutput);

const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
const size_t a_multi_idx = a_batch_idx + 1;
const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
const size_t d_multi_idx = d_batch_idx + 1;

int       batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);

int       multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
int       multi_stride_b = 0;
const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);
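// Hedged aside (standalone sketch, hypothetical helper): every stride computed above
// follows the same pattern - a byte stride taken from the tensor metadata divided by
// the element size, giving a stride expressed in elements.
#include <cstddef>

template <typename T>
inline int stride_in_elements(std::size_t stride_in_bytes)
{
    return static_cast<int>(stride_in_bytes / sizeof(T));
}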
auto             in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
const TypeInput *in1_ptr = nullptr;
auto             out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());

if(!_gemm_kernel_asm->B_is_pretransposed())
{
    ldb            = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
    multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
    in1_ptr        = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
}
if((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))

    _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);

    if(_B_pretranspose_required)

        const int  ldb            = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
        const auto b_ptr          = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
        const int  multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);

        CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);

        _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);

        _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());

CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
if(workspace.get()->buffer() != nullptr)

    _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
    const unsigned int split_dim   = scheduling_hint.split_dimension();
    const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();

    if(window_size < num_threads)
    {
        num_threads = window_size;
    }

    const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
    num_threads = std::min(num_iterations, num_threads);

    _gemm_kernel_asm->set_nthreads(num_threads);
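// Hedged aside (standalone sketch, hypothetical helper): the thread count above ends
// up clamped by both the assembly kernel's window size and the number of iterations
// available along the chosen split dimension.
#include <algorithm>

inline unsigned int clamp_thread_count(unsigned int requested, unsigned int window_size, unsigned int num_iterations)
{
    return std::min({ requested, window_size, num_iterations });
}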
TypeOutput *bias = nullptr;

bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());

_gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
                             in1_ptr, ldb, multi_stride_b,
                             out_ptr, ldd, batch_stride_d, multi_stride_d,
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
                     const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,

Params p = extract_parameters(a, b, d, info);

arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);

auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
fallback->configure(a, b, c, d, args, info);
arm_gemm = std::move(fallback);
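// Hedged aside (standalone sketch, hypothetical names): the dispatch pattern used by
// create_arm_gemm above - a type-erased fallback interface hides the concrete
// <TypeInput, TypeOutput> specialisation selected at configure time.
#include <memory>

struct IGemmFallbackSketch
{
    virtual ~IGemmFallbackSketch() = default;
    virtual void run() = 0;
};

template <typename TIn, typename TOut>
struct TypedFallbackSketch : IGemmFallbackSketch
{
    void run() override
    {
        // A real implementation would drive the <TIn, TOut> assembly kernel here.
    }
};

template <typename TIn, typename TOut>
void create_fallback_sketch(std::unique_ptr<IGemmFallbackSketch> &out)
{
    out = std::make_unique<TypedFallbackSketch<TIn, TOut>>();
}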
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
                           const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,

Params p = extract_parameters(a, b, d, info);

arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);

auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();

const int32_t negation = info.negated_offsets ? 1 : -1;
const int32_t a_offset = -a->quantization_info().uniform().offset * negation;
const int32_t b_offset = -b->quantization_info().uniform().offset * negation;

const GEMMLowpOutputStageInfo os_info = info.output_stage;

if(os_info.gemmlowp_shifts.size() > 1)

    const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);

        a_offset, b_offset, os_info.gemmlowp_offset,
        (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
        std::get<2>(requantize_data),
        std::get<3>(requantize_data),
        os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);

        a_offset, b_offset, os_info.gemmlowp_offset,
        -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
        os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);

fallback->configure(a, b, c, d, args, info, gemm_requant_info);
arm_gemm = std::move(fallback);
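// Hedged aside (standalone sketch, hypothetical helper): how the requantization
// offsets above are formed - the negated quantization zero-point, with the sign
// flipped back when the caller asks for negated offsets.
#include <cstdint>

inline int32_t requantize_offset(int32_t zero_point, bool negated_offsets)
{
    const int32_t negation = negated_offsets ? 1 : -1;
    return -zero_point * negation;
}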
Params p = extract_parameters(a, b, d, info);

arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, info.fast_mode);

    "We could not find an optimized kernel for F32 input");

    "We could not find an optimized kernel for U8/QASYMM8 input and S32 output");

    "We could not find an optimized kernel for U8 input and U8 output");

    "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");

    "We could not find an optimized kernel for S8 input and S32 output");

#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
    "We could not find an optimized kernel for BFLOAT16 input and F32 output");

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    "We could not find an optimized kernel for BFLOAT16 input and F32 output");
create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);

create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);

create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a, b, c, d, act, info);

create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);

create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);

#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
create_arm_gemm<bfloat16, float>(_arm_gemm, a, b, c, d, act, info);

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info);

_arm_gemm->prepare(tensors);

return _arm_gemm != nullptr && _arm_gemm->is_configured();

_arm_gemm->run(tensors);

return _arm_gemm->workspace();