#include "src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h"
#include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
// Call operator of the free_delete deleter used for the malloc-backed indirect buffers below.
void operator()(void *x)
Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *d, const AsmGemmInfo &info)
{
    // ...
    p.M = d->info()->tensor_shape().y();
    p.K = a->info()->tensor_shape().x();
    p.N = d->info()->tensor_shape().x();
    // ...
    p.sections = b->info()->tensor_shape()[2] * b->info()->tensor_shape()[3];
    // ...
    p.multis  = b->info()->tensor_shape().z();
    p.batches = d->info()->tensor_shape().total_size_upper(2) / p.multis;
    // ...
    // Update M and the batch count when the output is re-interpreted as a 3D tensor.
    if(info.depth_output_gemm3d != 0)
    {
        p.M       = d->info()->tensor_shape().y() * d->info()->tensor_shape().z();
        p.batches = d->info()->tensor_shape().total_size_upper(3) / p.multis;
    }
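For intuition, the mapping above reads the GEMM dimensions straight off the ACL tensor shapes (dimension 0 is the innermost, x). A standalone sketch with hypothetical shapes, not taken from the source:

#include <cstddef>
#include <iostream>

int main()
{
    // Hypothetical shapes in ACL order (x, y, z): a = (K, M, batches), d = (N, M, batches).
    const size_t a_shape[] = { 64, 32, 4 };
    const size_t d_shape[] = { 128, 32, 4 };

    const size_t M       = d_shape[1]; // d.y()
    const size_t K       = a_shape[0]; // a.x()
    const size_t N       = d_shape[0]; // d.x()
    const size_t batches = d_shape[2]; // product of dimensions 2 and above, divided by multis (1 here)

    std::cout << "M=" << M << " N=" << N << " K=" << K << " batches=" << batches << std::endl;
    // Prints: M=32 N=128 K=64 batches=4
    return 0;
}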
    switch(act.activation())
    {
        case ActivationLayerInfo::ActivationFunction::RELU:
            gemm_act.type = arm_gemm::Activation::Type::ReLU;
            break;
        case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
            gemm_act.type   = arm_gemm::Activation::Type::BoundedReLU;
            gemm_act.param1 = act.a();
            gemm_act.param2 = 0.f;
            break;
        case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
            gemm_act.type   = arm_gemm::Activation::Type::BoundedReLU;
            gemm_act.param1 = act.a();
            gemm_act.param2 = act.b();
            break;
    }
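In ActivationLayerInfo terms, BOUNDED_RELU clamps to [0, a()] and LU_BOUNDED_RELU to [b(), a()]; the cases above forward those bounds as param1/param2. A minimal standalone sketch of the clamping being requested (illustrative only, not arm_gemm code):

#include <algorithm>
#include <cassert>

// upper = ActivationLayerInfo::a(), lower = ActivationLayerInfo::b() (0.f for BOUNDED_RELU).
static float bounded_relu(float x, float upper, float lower)
{
    return std::min(upper, std::max(lower, x));
}

int main()
{
    assert(bounded_relu(7.5f, 6.0f, 0.0f) == 6.0f);    // clipped to the upper bound
    assert(bounded_relu(-2.0f, 6.0f, -1.0f) == -1.0f); // clipped to the lower bound
    assert(bounded_relu(3.0f, 6.0f, 0.0f) == 3.0f);    // inside the bounds: unchanged
    return 0;
}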
IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type)
{
    // ...
    const int         granule_threshold = 200;
    IScheduler::Hints scheduling_hint   = IScheduler::Hints(Window::DimX);
    if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
    {
        // ...
    }
    // ...
    return scheduling_hint;
}
template <typename TypeInput, typename TypeOutput>
class FallbackTransform : public ITransformWeights
{
public:
    FallbackTransform() noexcept {}
    /** Prevent copies, allow moves */
    FallbackTransform(const FallbackTransform &) = delete;
    FallbackTransform(FallbackTransform &&)      = default;
    FallbackTransform &operator=(const FallbackTransform &) = delete;
    FallbackTransform &operator=(FallbackTransform &&) = default;

    void run() override
    {
        _output.allocator()->allocate();
        // ...
        _gemm_kernel_asm->pretranspose_B_array(_output.buffer(), _in1_ptr, _ldb, _multi_stride_b);
        // ...
    }

    void release() override
    {
        _output.allocator()->free();
    }

    ITensor *get_weights() override
    {
        return &_output;
    }

    uint32_t uid() override
    {
        uint32_t id = (_B_pretranspose_size | 0x80000000);
        return id;
    }

    void configure(size_t B_pretranspose_size, unsigned int alignment)
    {
        _output.allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment);
        _B_pretranspose_size = B_pretranspose_size;
    }

    void set_pretranspose(ITensor *tensor)
    {
        // ...
        _gemm_kernel_asm->set_pretransposed_B_data(tensor->buffer());
    }

    void set_args(const int ldb, const TypeInput *in1_ptr, const int multi_stride_b, std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> gemm_kernel_asm)
    {
        _ldb             = ldb;
        _in1_ptr         = in1_ptr;
        _multi_stride_b  = multi_stride_b;
        _gemm_kernel_asm = gemm_kernel_asm;
    }

private:
    Tensor           _output{};
    int              _ldb{};
    const TypeInput *_in1_ptr{};
    int              _multi_stride_b{};
    size_t           _B_pretranspose_size{};
    std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
};
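The uid() above appears to tag the pretranspose buffer size with the top bit set, presumably so these transform ids stay distinct from other managed weight transforms; a quick standalone check of that arithmetic with made-up sizes:

#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t size_a = 1024, size_b = 2048;  // hypothetical B_pretranspose sizes
    const uint32_t uid_a  = size_a | 0x80000000u;
    const uint32_t uid_b  = size_b | 0x80000000u;
    assert(uid_a != uid_b);                       // different sizes give different uids
    assert((uid_a & 0x80000000u) != 0u);          // the marker bit is always set
    return 0;
}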
template <typename TypeInput, typename TypeOutput, class OutputStage = arm_gemm::Nothing>
class Fallback : public NEGEMMAssemblyDispatch::IFallback
{
public:
    /** Destructor */
    ~Fallback()
    {
        // Only delete the pre-transpose tensor if we allocated it ourselves (i.e. it is not weights-manager owned).
        if(_pretranspose && !(_weights_manager && _weights_manager->are_weights_managed(_b)))
        {
            delete _pretranspose;
        }
    }

    void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
                   arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
                   MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {});

    /** Set the requantization shifts and multipliers used when the output stage is per-channel. */
    std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
                                                                                            const std::vector<int32_t> &multipliers);

    void run() override;
    void prepare() override;
    bool is_configured() const override;

private:
    /** Allocate the GEMM workspace through the memory group. */
    void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment);
    /** Configure the convolution/indirect parameters of the assembly kernel. */
    void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
    /** Build the table of input pointers used by indirect GEMM. */
    void prepare_indirect_buffer();

    /** Assembly GEMM kernel */
    std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
    /** Optimised Neon kernel wrapping the assembly kernel */
    std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
    /** Input/output tensors and workspace */
    const ITensor *_a{ nullptr };
    const ITensor *_b{ nullptr };
    const ITensor *_c{ nullptr };
    ITensor       *_d{ nullptr };
    Tensor         _workspace{};
    /** Pre-transposed B tensor */
    ITensor *_pretranspose{ nullptr };
    /** Prepared flag */
    bool _is_prepared{ false };
    /** GEMM meta-data */
    AsmGemmInfo _gemm_info{};
    /** Weights manager (may be nullptr) */
    IWeightsManager *_weights_manager{ nullptr };
    /** Weights transform used when the weights manager owns the pre-transposed B */
    FallbackTransform<TypeInput, TypeOutput> _weights_transform{};
    /** Selected arm_gemm kernel */
    arm_gemm::KernelDescription _kernel_info{};
    /** Per-channel requantization shifts and multipliers */
    std::vector<int32_t> _shifts{};
    std::vector<int32_t> right_shifts{};
    std::vector<int32_t> left_shifts{};
    std::vector<int32_t> _multipliers{};
    /** Indirect buffer (table of input pointers) and padding row */
    std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
    std::unique_ptr<const TypeInput *, free_delete>        _indirect_buf{};
    std::vector<TypeInput>                                 _indirect_pad{};
    arm_gemm::ConvolutionParameters                        _cp{};
};
template <typename TypeInput, typename TypeOutput, class OutputStage>
std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
{
    _multipliers = multipliers;
    _shifts      = shifts;
    bool need_left = false;
    for(const auto s : _shifts)
    {
        left_shifts.push_back(std::max(-s, int32_t(0)));
        right_shifts.push_back(std::min(-s, int32_t(0)));
        if(s < 0 && !need_left)
        {
            need_left = true;
        }
    }
    return std::make_tuple(need_left, left_shifts.data(), right_shifts.data(), _multipliers.data());
}
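The loop above splits each per-channel shift into a non-negative left-shift and a non-positive right-shift, and only reports the left-shift array as needed when at least one shift is negative. A standalone check of that arithmetic with made-up shift values:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    // Hypothetical per-channel shifts, read the same way as in the loop above: -s feeds the left/right split.
    const std::vector<int32_t> shifts = { 2, -3, 0 };

    std::vector<int32_t> left, right;
    bool need_left = false;
    for(const auto s : shifts)
    {
        left.push_back(std::max(-s, int32_t(0)));
        right.push_back(std::min(-s, int32_t(0)));
        if(s < 0)
        {
            need_left = true;
        }
    }

    assert((left == std::vector<int32_t>{ 0, 3, 0 }));
    assert((right == std::vector<int32_t>{ -2, 0, 0 }));
    assert(need_left); // at least one negative shift, so the left-shift array is actually needed
    return 0;
}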
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer()
{
    const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(_a->buffer());
    // ...
    const int    batches        = _a->info()->tensor_shape().total_size_upper(3);
    const size_t stride_A       = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
    const size_t batch_stride_A = _a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
    const size_t multi_stride_A = _a->info()->strides_in_bytes()[4] / sizeof(TypeInput);

    const size_t output_hw    = _cp.output_height * _cp.output_width;
    const int    batch_size   = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput);
    const size_t batch_stride = batch_size / sizeof(TypeInput);
    const int    multi_size   = batch_size * batches;
    const size_t multi_stride = multi_size / sizeof(TypeInput);

    for(int64_t m = 0; m < multis; m++)
    {
        for(int64_t b = 0; b < batches; b++)
        {
            for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
            {
                for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
                {
                    int64_t output_xy = (output_y * _cp.output_width) + output_x;

                    for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
                    {
                        for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
                        {
                            int64_t input_x   = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
                            int64_t input_y   = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
                            int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
                            int64_t input_xy  = (input_y * _cp.input_width) + input_x;

                            // Out-of-bounds reads point at the shared padding row.
                            if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
                            {
                                _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
                            }
                            else
                            {
                                _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
                                    A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
                            }
                        }
                    }
                }
            }
        }
    }
}
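The nested loops build one pointer per (kernel position, output position), pointing either at the input row the convolution would read or at the shared padding row when the read falls outside the input. A standalone sketch of just the coordinate arithmetic and bounds check, with small made-up convolution parameters:

#include <cstdint>
#include <iostream>

int main()
{
    // Hypothetical 3x3 kernel, stride 1, pad 1, 4x4 input, 4x4 output.
    const int64_t kernel_w = 3, kernel_h = 3, stride_w = 1, stride_h = 1;
    const int64_t pad_left = 1, pad_top = 1, input_w = 4, input_h = 4;

    int padded = 0, valid = 0;
    for(int64_t output_y = 0; output_y < 4; output_y++)
        for(int64_t output_x = 0; output_x < 4; output_x++)
            for(int64_t kernel_y = 0; kernel_y < kernel_h; kernel_y++)
                for(int64_t kernel_x = 0; kernel_x < kernel_w; kernel_x++)
                {
                    // Same arithmetic as prepare_indirect_buffer() above.
                    const int64_t input_x = (output_x * stride_w) + kernel_x - pad_left;
                    const int64_t input_y = (output_y * stride_h) + kernel_y - pad_top;
                    const bool out_of_bounds = input_x < 0 || input_x >= input_w || input_y < 0 || input_y >= input_h;
                    if(out_of_bounds)
                    {
                        ++padded; // this entry would point at the padding row
                    }
                    else
                    {
                        ++valid; // this entry would point into the input tensor
                    }
                }

    // 4*4*9 = 144 pointers in total; the border positions point at the padding row.
    std::cout << "valid=" << valid << " padded=" << padded << " total=" << (valid + padded) << std::endl;
    return 0;
}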
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
{
    // ...
    zeropad = a->quantization_info().uniform().offset;
    // ...
    const int64_t input_width    = static_cast<int64_t>(a->tensor_shape()[1]);
    const int64_t input_height   = static_cast<int64_t>(a->tensor_shape()[2]);
    const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
    const int64_t kernel_width   = static_cast<int64_t>(b->tensor_shape()[2]);
    const int64_t kernel_height  = static_cast<int64_t>(b->tensor_shape()[3]);
    const int64_t output_width   = static_cast<int64_t>(d->tensor_shape()[1]);
    const int64_t output_height  = static_cast<int64_t>(d->tensor_shape()[2]);

    _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
            info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad };

    // ...
    _gemm_kernel_asm->set_convolution_parameters(_cp);

    // ...
    const unsigned int multis    = 1;
    const unsigned int batches   = a->tensor_shape().total_size_upper(3);
    const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height;
    const unsigned int output_hw = _cp.output_width * _cp.output_height;

    using TypeInputPtr        = TypeInput *;
    const int    batch_size   = kernel_hw * output_hw * sizeof(TypeInputPtr);
    const size_t batch_stride = batch_size / sizeof(TypeInputPtr);
    const int    multi_size   = batch_size * batches;
    const size_t multi_stride = multi_size / sizeof(TypeInputPtr);

    // The pointer tables are malloc-allocated so they pair with the free_delete deleter.
    _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
    _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
    _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));

    // ...
    int64_t pos = 0;
    for(int64_t m = 0; m < multis; m++)
    {
        for(int64_t b = 0; b < batches; b++)
        {
            for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
            {
                (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
            }
        }
    }

    _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get());
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
                                                             arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
                                                             MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os)
{
    arm_gemm::GemmConfig gemm_cfg;
    _kernel_info     = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
    _weights_manager = weights_manager;
    if(_kernel_info.method != arm_gemm::GemmMethod::GEMV_BATCHED)
    {
        gemm_cfg.filter = _kernel_info.name;
        args._cfg       = &gemm_cfg;
    }
    _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
    if(_gemm_kernel_asm == nullptr)
    {
        // GEMM method not supported: nothing to configure.
        return;
    }

    // Arm Compute wrapper kernel around the arm_gemm kernel
    std::unique_ptr<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>> acl_gemm_wrapper = std::make_unique<NEGEMMAssemblyWrapperKernel<TypeInput, TypeOutput>>();
    // ...
    acl_gemm_wrapper->configure(_gemm_kernel_asm.get(), gemm_cfg.filter);
    const size_t workspace_size = _gemm_kernel_asm->get_working_size();
    if(workspace_size > 0)
    {
        // ...
        const unsigned int alignment = 4096;
        allocate_workspace(workspace_size, memory_group, alignment);
    }

    // Cap the number of threads to the kernel's window size.
    const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
    if(window_size < static_cast<unsigned int>(args._maxthreads))
    {
        _gemm_kernel_asm->set_nthreads(window_size);
    }

    _optimised_kernel = std::move(acl_gemm_wrapper);
    // ...
    _gemm_info = gemm_info;

    // Check if B needs to be pre-transposed by the assembly kernel.
    if(_gemm_kernel_asm->B_pretranspose_required())
    {
        // ...
        const unsigned int alignment           = 128;
        const size_t       B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
        if(weights_manager && _weights_manager->are_weights_managed(b))
        {
            _weights_transform.configure(B_pretranspose_size, alignment);
            _pretranspose = _weights_manager->acquire(b, &_weights_transform);
        }
        else
        {
            _pretranspose = new Tensor();
            static_cast<Tensor *>(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment) }, 1, DataType::S8), alignment);
        }
    }

    // ...
    configure_indirect(a->info(), b->info(), d->info(), gemm_info);
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::prepare()
{
    // ...
    // Set up the matrix bias in the assembly kernel; it is just a pointer to matrix C.
    _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(_c->buffer() + _c->info()->offset_first_element_in_bytes()), 0);
    // ...
    // Pretranspose B if required by the selected kernel.
    if(_gemm_kernel_asm->B_pretranspose_required())
    {
        const int  ldb            = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
        const auto in1_ptr        = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
        const int  multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);

        if(_weights_manager && _weights_manager->are_weights_managed(_b))
        {
            _weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm);
            _weights_manager->run(_b, &_weights_transform);

            // If the reshape was not run through the weights manager, hand the buffer to the kernel directly.
            if(!_weights_transform.is_reshape_run())
            {
                _weights_transform.set_pretranspose(_pretranspose);
            }
        }
        else
        {
            static_cast<Tensor *>(_pretranspose)->allocator()->allocate();
            // ...
            _gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b);
            _b->mark_as_unused();
        }
    }

    // ...
    prepare_indirect_buffer();
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment)
{
    // ...
    _workspace.allocator()->init(TensorInfo(TensorShape{ (workspace_size + alignment) }, 1, DataType::S8), alignment);
    memory_group.manage(&_workspace);
    _workspace.allocator()->allocate();
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
{
    return _optimised_kernel != nullptr;
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::run()
{
    int       lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
    int       ldb = 0;
    const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput);

    const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
    const size_t a_multi_idx = a_batch_idx + 1;
    const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
    const size_t d_multi_idx = d_batch_idx + 1;

    int       batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
    const int batch_stride_d = _d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);

    int       multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
    int       multi_stride_b = 0;
    const int multi_stride_d = _d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);

    auto             in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
    const TypeInput *in1_ptr = nullptr;
    auto             out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes());

    // B's pointer and strides are only needed if it has not been pre-transposed already.
    if(!_gemm_kernel_asm->B_is_pretransposed())
    {
        ldb            = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
        multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
        in1_ptr        = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
    }

    const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, _d->info()->data_type());

    // Set the workspace and clamp the number of threads to what the kernel window can actually use.
    if(_workspace.buffer() != nullptr)
    {
        _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
        const unsigned int split_dim   = scheduling_hint.split_dimension();
        const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
        unsigned int       num_threads = NEScheduler::get().num_threads();
        if(window_size < num_threads)
        {
            num_threads = window_size;
        }
        // ...
        const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
        num_threads                       = std::min(num_iterations, num_threads);
        // ...
        _gemm_kernel_asm->set_nthreads(num_threads);
    }

    // ...
    // Set the quantized bias pointer if matrix C is present.
    TypeOutput *bias = nullptr;
    // ...
    bias = reinterpret_cast<TypeOutput *>(_c->buffer() + _c->info()->offset_first_element_in_bytes());
    // ...
    // Pass all pointers and strides to the assembly kernel.
    _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
                                 in1_ptr, ldb, multi_stride_b,
                                 out_ptr, ldd, batch_stride_d, multi_stride_d,
                                 bias, 0);
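The leading dimensions and strides handed to set_arrays() are expressed in elements, not bytes, hence the division by sizeof(TypeInput/TypeOutput) above. A tiny standalone check with a hypothetical padded row stride:

#include <cassert>
#include <cstddef>

int main()
{
    const size_t row_stride_bytes = 48;                 // hypothetical float32 row stride (may include padding)
    const size_t lda              = row_stride_bytes / sizeof(float);
    assert(lda == 12);                                  // leading dimension in elements
    return 0;
}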
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
                     const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
                     arm_gemm::Activation activation, const AsmGemmInfo &info,
                     IWeightsManager *weights_manager)
{
    Params p = extract_parameters(a, b, d, info);
    // ...
    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
    const unsigned int num_threads = NEScheduler::get().num_threads();

    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);

    // Create the arm_gemm fallback
    auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
    fallback->configure(a, b, c, d, args, info, memory_group, weights_manager);
    arm_gemm = std::move(fallback);
}
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm_quant(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
                           const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
                           arm_gemm::Activation activation, const AsmGemmInfo &info,
                           IWeightsManager *weights_manager)
{
    Params p = extract_parameters(a, b, d, info);
    // ...
    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
    const unsigned int num_threads = NEScheduler::get().num_threads();

    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);

    // Create the requantizing arm_gemm fallback
    auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();

    // Configure requantization info
    const int32_t                 negation = info.negated_offsets ? 1 : -1;
    const int32_t                 a_offset = -a->info()->quantization_info().uniform().offset * negation;
    const int32_t                 b_offset = -b->info()->quantization_info().uniform().offset * negation;
    const GEMMLowpOutputStageInfo os_info  = info.output_stage;

    arm_gemm::Requantize32 gemm_requant_info{};
    if(os_info.gemmlowp_shifts.size() > 1)
    {
        // Per-channel requantization: hand the per-channel shift/multiplier arrays to arm_gemm.
        const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
        gemm_requant_info          = arm_gemm::Requantize32(nullptr, 0,
                                                            a_offset, b_offset, os_info.gemmlowp_offset,
                                                            (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
                                                            std::get<2>(requantize_data),
                                                            std::get<3>(requantize_data),
                                                            os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
    }
    else
    {
        // Per-tensor requantization
        gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
                                                   a_offset, b_offset, os_info.gemmlowp_offset,
                                                   -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
                                                   os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
    }

    // Configure the fallback with the requantization output stage
    fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info);
    arm_gemm = std::move(fallback);
}
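The offset negation above just flips the sign convention depending on info.negated_offsets; for a hypothetical uniform quantization offset of 5, a_offset is -5 when negated_offsets is true and +5 otherwise:

#include <cassert>
#include <cstdint>

int main()
{
    const int32_t uniform_offset = 5; // hypothetical quantization offset
    for(const bool negated_offsets : { true, false })
    {
        const int32_t negation = negated_offsets ? 1 : -1;
        const int32_t a_offset = -uniform_offset * negation; // same expression as above
        assert(a_offset == (negated_offsets ? -5 : 5));
    }
    return 0;
}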
NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
    : _arm_gemm(nullptr), _memory_group(std::move(memory_manager)), _weights_manager(weights_manager)
{
}
    // F32
    create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
    // ...
    // U8/QASYMM8 input with S32 output
    create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
    // ...
    // QASYMM8 input with requantized QASYMM8 output
    create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
    // ...
    // S8/QASYMM8_SIGNED input with S32 output
    create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
    // ...
    // QASYMM8_SIGNED input with requantized QASYMM8_SIGNED output
    create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
    // ...
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
    // BFLOAT16 input with F32 output
    create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
#endif
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // F16
    create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
#endif
void NEGEMMAssemblyDispatch::prepare()
{
    // ...
    _arm_gemm->prepare();
}

bool NEGEMMAssemblyDispatch::is_configured() const
{
    return _arm_gemm != nullptr && _arm_gemm->is_configured();
}
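Taken together, a caller drives this dispatcher as configure, then prepare, then run. A hedged usage sketch, not from the source: the include paths, the default-constructed AsmGemmInfo and the F32 shapes are assumptions for illustration.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "src/runtime/NEON/functions/NEGEMMAssemblyDispatch.h" // assumed internal path

using namespace arm_compute;

void example_f32_gemm()
{
    const unsigned int M = 32, N = 128, K = 64;

    // ACL shapes are (x, y, ...): a is MxK -> (K, M), b is KxN -> (N, K), d is MxN -> (N, M).
    Tensor a, b, d;
    a.allocator()->init(TensorInfo(TensorShape(K, M), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(N, K), 1, DataType::F32));
    d.allocator()->init(TensorInfo(TensorShape(N, M), 1, DataType::F32));
    a.allocator()->allocate();
    b.allocator()->allocate();
    d.allocator()->allocate();

    AsmGemmInfo info{}; // assumed defaults: no activation, no 3D re-interpretation

    NEGEMMAssemblyDispatch gemm;
    const Status status = NEGEMMAssemblyDispatch::validate(a.info(), b.info(), nullptr, d.info(), info);
    if(status.error_code() == ErrorCode::OK)
    {
        gemm.configure(&a, &b, nullptr, &d, info); // selects and configures the assembly kernel
        gemm.prepare();                            // pre-transposes B / builds indirect buffers once
        gemm.run();                                // executes the assembly kernel
    }
}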