46 void operator()(
void *x)
63 Params extract_parameters(
const ITensorInfo *a,
const ITensorInfo *
b,
const ITensorInfo *d,
const AsmGemmInfo &
info)
67 p.M = d->tensor_shape().y();
68 p.K = a->tensor_shape().x();
69 p.N = d->tensor_shape().x();
78 p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];
82 p.multis = b->tensor_shape().z();
83 p.batches = d->tensor_shape().total_size_upper(2) / p.multis;
87 if(info.depth_output_gemm3d != 0)
89 p.M = d->tensor_shape().y() * d->tensor_shape().z();
90 p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
99 const int granule_threshold = 200;
100 IScheduler::Hints scheduling_hint = IScheduler::Hints(
Window::DimX);
116 return scheduling_hint;
120 template <
typename TypeInput,
typename TypeOutput,
class OutputStage = arm_gemm::Nothing>
121 class Fallback :
public CpuGemmAssemblyDispatch::IFallback
125 ~Fallback() =
default;
137 void configure(
const ITensorInfo *a,
const ITensorInfo *
b,
const ITensorInfo *c, ITensorInfo *d,
139 const OutputStage &os = {});
155 std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(
const std::vector<int32_t> &shifts,
156 const std::vector<int32_t> &multipliers);
159 void run(ITensorPack &tensors)
override;
160 void prepare(ITensorPack &tensors)
override;
161 bool is_configured()
const override;
167 AsmGemmWorkspace = 0,
179 void configure_indirect(
const ITensorInfo *a,
const ITensorInfo *b,
const ITensorInfo *d,
const AsmGemmInfo &
info);
181 void prepare_indirect_buffer(ITensorPack &tensors);
184 std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{
nullptr };
186 std::unique_ptr<INEKernel> _optimised_kernel{
nullptr };
188 TensorInfo _workspace_info{};
190 TensorInfo _pretranspose_info{};
192 bool _is_prepared{
false };
194 AsmGemmInfo _gemm_info{};
198 std::vector<int32_t> _shifts{};
199 std::vector<int32_t> right_shifts{};
200 std::vector<int32_t> left_shifts{};
202 std::vector<int32_t> _multipliers{};
204 std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
205 std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
206 std::vector<TypeInput> _indirect_pad{};
211 template <
typename TypeInput,
typename TypeOutput,
class OutputStage>
212 std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
213 Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(
const std::vector<int32_t> &shifts,
const std::vector<int32_t> &multipliers)
215 _multipliers = multipliers;
217 bool need_left =
false;
218 for(
const auto s : _shifts)
220 left_shifts.push_back(std::max(-s, int32_t(0)));
221 right_shifts.push_back(std::min(-s, int32_t(0)));
222 if(s < 0 && !need_left)
227 return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
230 template <
typename TypeInput,
typename TypeOutput,
class OutputStage>
231 void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)
234 const TypeInput *A_ptr =
reinterpret_cast<TypeInput *
>(a->buffer());
236 const int batches = a->info()->tensor_shape().total_size_upper(3);
237 const size_t stride_A = a->info()->strides_in_bytes().y() /
sizeof(TypeInput);
238 const size_t batch_stride_A = a->info()->strides_in_bytes()[3] /
sizeof(TypeInput);
239 const size_t multi_stride_A = a->info()->strides_in_bytes()[4] /
sizeof(TypeInput);
243 const size_t batch_stride = batch_size /
sizeof(TypeInput);
244 const int multi_size = batch_size *
batches;
245 const size_t multi_stride = multi_size /
sizeof(TypeInput);
247 for(int64_t m = 0; m <
multis; m++)
251 for(int64_t output_y = 0; output_y < _cp.
output_height; output_y++)
253 for(int64_t output_x = 0; output_x < _cp.
output_width; output_x++)
255 int64_t output_xy = (output_y * _cp.
output_width) + output_x;
257 for(int64_t kernel_y = 0; kernel_y < _cp.
kernel_height; kernel_y++)
259 for(int64_t kernel_x = 0; kernel_x < _cp.
kernel_width; kernel_x++)
263 int64_t kernel_xy = (kernel_y * _cp.
kernel_width) + kernel_x;
264 int64_t input_xy = (input_y * _cp.
input_width) + input_x;
268 _indirect_buf.get()[m * multi_stride +
b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
272 _indirect_buf.get()[m * multi_stride +
b * batch_stride + kernel_xy * output_hw + output_xy] =
273 A_ptr + (m * multi_stride_A +
b * batch_stride_A + input_xy * stride_A);
283 template <
typename TypeInput,
typename TypeOutput,
class OutputStage>
284 void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(
const ITensorInfo *a,
const ITensorInfo *
b,
const ITensorInfo *d,
const AsmGemmInfo &
info)
291 zeropad = a->quantization_info().uniform().offset;
294 const int64_t
input_width =
static_cast<int64_t
>(a->tensor_shape()[1]);
295 const int64_t
input_height =
static_cast<int64_t
>(a->tensor_shape()[2]);
296 const int64_t input_channels =
static_cast<int64_t
>(a->tensor_shape()[0]);
297 const int64_t kernel_width =
static_cast<int64_t
>(b->tensor_shape()[2]);
298 const int64_t kernel_height =
static_cast<int64_t
>(b->tensor_shape()[3]);
299 const int64_t output_width =
static_cast<int64_t
>(d->tensor_shape()[1]);
300 const int64_t output_height =
static_cast<int64_t
>(d->tensor_shape()[2]);
303 info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
308 _gemm_kernel_asm->set_convolution_parameters(_cp);
313 const unsigned int multis = 1;
314 const unsigned int batches = a->tensor_shape().total_size_upper(3);
318 using TypeInputPtr = TypeInput *;
319 const int batch_size = kernel_hw * output_hw *
sizeof(TypeInputPtr);
320 const size_t batch_stride = batch_size /
sizeof(TypeInputPtr);
321 const int multi_size = batch_size *
batches;
322 const size_t multi_stride = multi_size /
sizeof(TypeInputPtr);
324 _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(
reinterpret_cast<const TypeInput **
>(malloc(multi_size * multis)));
325 _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(
reinterpret_cast<const TypeInput *
const **
>(malloc(
sizeof(TypeInput **) * kernel_hw * multis * batches)));
326 _indirect_pad = std::vector<TypeInput>(_cp.
input_channels, TypeInput(zeropad));
330 for(int64_t m = 0; m <
multis; m++)
332 for(int64_t b = 0; b <
batches; b++)
334 for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
336 (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
341 _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
345 template <
typename TypeInput,
typename TypeOutput,
class OutputStage>
348 const OutputStage &os)
352 _kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(
args, os);
356 args._cfg = &gemm_cfg;
358 _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(
args, os);
359 if(_gemm_kernel_asm ==
nullptr)
366 auto acl_gemm_wrapper = std::make_unique<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeOutput>>();
368 acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.
filter);
369 const size_t workspace_size = _gemm_kernel_asm->get_working_size();
370 const unsigned int alignment = 4096;
371 _workspace_info = TensorInfo(TensorShape(workspace_size), 1,
DataType::U8);
372 _aux_mem[AsmGemmWorkspace] =
MemoryInfo(
offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
377 const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
378 if(window_size < static_cast<unsigned int>(args._maxthreads))
380 _gemm_kernel_asm->set_nthreads(window_size);
384 _optimised_kernel = std::move(acl_gemm_wrapper);
385 _gemm_info = gemm_info;
387 if(_gemm_kernel_asm->B_pretranspose_required())
390 const unsigned int alignment = 128;
391 const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
392 _pretranspose_info = TensorInfo(TensorShape(B_pretranspose_size), 1,
DataType::U8);
393 _aux_mem[Pretranspose] =
MemoryInfo(
offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
399 configure_indirect(a, b, d, gemm_info);
403 template <
typename TypeInput,
typename TypeOutput,
class OutputStage>
404 void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
414 _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
418 if(_gemm_kernel_asm->B_pretranspose_required())
420 const int ldb = b->info()->strides_in_bytes().y() /
sizeof(TypeInput);
421 const auto in1_ptr =
reinterpret_cast<const TypeInput *
>(b->buffer() + b->info()->offset_first_element_in_bytes());
422 const int multi_stride_b = b->info()->strides_in_bytes().z() /
sizeof(TypeInput);
424 CpuAuxTensorHandler pretranspose(
offset_int_vec(Pretranspose), _pretranspose_info, tensors,
false);
426 _gemm_kernel_asm->pretranspose_B_array(pretranspose.get()->buffer(), in1_ptr, ldb, multi_stride_b);
433 prepare_indirect_buffer(tensors);
440 template <
typename TypeInput,
typename TypeOutput,
class OutputStage>
441 bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured()
const 443 return _optimised_kernel !=
nullptr;
446 template <
typename TypeInput,
typename TypeOutput,
class OutputStage>
452 template <
typename TypeInput,
typename TypeOutput,
class OutputStage>
460 int lda = a->info()->strides_in_bytes().y() /
sizeof(TypeInput);
462 const int ldd = d->info()->strides_in_bytes().y() /
sizeof(TypeOutput);
464 const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
465 const size_t a_multi_idx = a_batch_idx + 1;
466 const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
467 const size_t d_multi_idx = d_batch_idx + 1;
469 int batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] /
sizeof(TypeInput);
470 const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] /
sizeof(TypeOutput);
472 int multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] /
sizeof(TypeInput);
473 int multi_stride_b = 0;
474 const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] /
sizeof(TypeOutput);
476 auto in0_ptr =
reinterpret_cast<const TypeInput *
>(a->buffer() + a->info()->offset_first_element_in_bytes());
477 const TypeInput *in1_ptr =
nullptr;
478 auto out_ptr =
reinterpret_cast<TypeOutput *
>(d->buffer() + d->info()->offset_first_element_in_bytes());
481 if(!_gemm_kernel_asm->B_is_pretransposed())
483 ldb = b->info()->strides_in_bytes().y() /
sizeof(TypeInput);
484 multi_stride_b = b->info()->strides_in_bytes().z() /
sizeof(TypeInput);
485 in1_ptr =
reinterpret_cast<const TypeInput *
>(b->buffer() + b->info()->offset_first_element_in_bytes());
488 const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.
method, d->info()->data_type());
491 CpuAuxTensorHandler workspace(
offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors,
false);
492 if(workspace.get()->buffer() !=
nullptr)
494 _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
495 const unsigned int split_dim = scheduling_hint.split_dimension();
496 const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
498 if(window_size < num_threads)
500 num_threads = window_size;
505 const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
506 num_threads = std::min(num_iterations, num_threads);
508 _gemm_kernel_asm->set_nthreads(num_threads);
515 TypeOutput *bias =
nullptr;
518 bias =
reinterpret_cast<TypeOutput *
>(c->buffer() + c->info()->offset_first_element_in_bytes());
530 _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
531 in1_ptr, ldb, multi_stride_b,
532 out_ptr, ldd, batch_stride_d, multi_stride_d,
538 template <
typename TypeInput,
typename TypeOutput>
539 void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
arm_gemm,
540 const ITensorInfo *a,
const ITensorInfo *b,
const ITensorInfo *c, ITensorInfo *d,
543 Params p = extract_parameters(a, b, d, info);
547 arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
550 auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
551 fallback->configure(a, b, c, d, args, info);
552 arm_gemm = std::move(fallback);
555 template <
typename TypeInput,
typename TypeOutput>
556 void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
557 const ITensorInfo *a,
const ITensorInfo *b,
const ITensorInfo *c, ITensorInfo *d,
561 Params p = extract_parameters(a, b, d, info);
565 arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode);
568 auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
571 const int32_t negation = info.negated_offsets ? 1 : -1;
572 const int32_t a_offset = -a->quantization_info().uniform().offset * negation;
573 const int32_t b_offset = -b->quantization_info().uniform().offset * negation;
574 const GEMMLowpOutputStageInfo os_info = info.output_stage;
577 if(os_info.gemmlowp_shifts.size() > 1)
579 const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
581 a_offset, b_offset, os_info.gemmlowp_offset,
582 (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) :
nullptr,
583 std::get<2>(requantize_data),
584 std::get<3>(requantize_data),
585 os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
590 a_offset, b_offset, os_info.gemmlowp_offset,
591 -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
592 os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
596 fallback->configure(a, b, c, d, args, info, gemm_requant_info);
597 arm_gemm = std::move(fallback);
657 create_arm_gemm<float, float>(_arm_gemm, a,
b, c, d, act,
info);
664 create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a,
b, c, d, act,
info);
668 create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, a,
b, c, d, act,
info);
675 create_arm_gemm<int8_t, int32_t>(_arm_gemm, a,
b, c, d, act,
info);
679 create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a,
b, c, d, act,
info);
683 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) 685 create_arm_gemm<bfloat16, float>(_arm_gemm, a,
b, c, d, act,
info);
688 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 690 create_arm_gemm<float16_t, float16_t>(_arm_gemm, a,
b, c, d, act,
info);
701 _arm_gemm->prepare(tensors);
706 return _arm_gemm !=
nullptr && _arm_gemm->is_configured();
712 _arm_gemm->run(tensors);
718 return _arm_gemm->workspace();
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
ActivationLayerInfo activation_info
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor)
1 channel, 1 U8 per channel
static bool is_activation_supported(const ActivationLayerInfo &activation)
Checks if activation is supported by the gemm assembly dispatcher.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
Split the workload evenly among the threads.
static constexpr unsigned int split_dimensions_all
When arm_compute::IScheduler::Hints::_split_dimension is initialized with this value then the scheduler is free to break down the problem space over as many dimensions as it wishes.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Store the tensor's metadata.
CPUInfo & cpu_info()
Get CPU info.
Activation Layer Information class.
Copyright (c) 2017-2021 Arm Limited.
std::vector< MemoryInfo > MemoryRequirements
1 channel, 1 F16 per channel
Split the workload dynamically using a bucket system.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
1 channel, 1 S32 per channel
16-bit brain floating-point number
void run(ITensorPack &tensors) override
Run the kernels contained in the function.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
Indicates whether or not this function can be used to process the given parameters.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
1 channel, 1 U32 per channel
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
If supported, create a Compute Library function; otherwise fall back to the arm_gemm function.
quantized, asymmetric fixed-point 8-bit number unsigned
virtual size_t element_size() const =0
Element size in bytes calculated as data_size() * num_channels()
bool is_configured() const
Was the function successfully configured ?
void prepare(ITensorPack &tensors) override
Prepare the function for executing.
quantized, symmetric per channel fixed-point 8-bit number
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
virtual void schedule(ICPPKernel *kernel, const Hints &hints)=0
Runs the kernel in the same thread as the caller synchronously.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
const size_t input_height
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
int offset_int_vec(int offset)
quantized, asymmetric fixed-point 8-bit number signed
arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
Performs a mapping between Compute Library ActivationLayerInfo and the assembly Activation structure...
virtual unsigned int num_threads() const =0
Returns the number of threads that the SingleThreadScheduler has in its pool.
im2col_func configure(src_target.info(), dst_target.info(), spatial_kernel, conv_info, has_bias)
CpuGemmAssemblyDispatch()
Constructor.
DataType
Available data types.
experimental::MemoryRequirements workspace() const override
Return the memory requirements required by the workspace.
static IScheduler & get()
Access the scheduler singleton.