// NOTE(review): isolated fragment of validate_arguments() — presumably the guard
// that skips output-shape/type checks until the output tensor has been allocated
// (total_size() == 0 means "not yet configured"). Confirm against the full file.
51 if(output->total_size() > 0)
60 void matrix_addition_f32(
const ITensor *input, ITensor *output,
const Window &window,
float beta)
62 const float32x4_t beta_f32 = vdupq_n_f32(beta);
64 Iterator in(input, window);
65 Iterator out(output, window);
69 const auto in_ptr =
reinterpret_cast<const float *
>(in.ptr());
70 const auto out_ptr =
reinterpret_cast<float *
>(out.ptr());
72 float32x4x4_t alpha_ab = vld4q_f32(out_ptr);
73 const float32x4x4_t c = vld4q_f32(in_ptr);
76 alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
77 alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
78 alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
79 alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
81 vst4q_f32(out_ptr, alpha_ab);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Accumulate the weighted C matrix into the output: out += beta * in (F16 path).
 *
 * Only compiled when the toolchain provides FP16 vector arithmetic.
 *
 * @param input  Tensor holding matrix C.
 * @param output Tensor holding alpha*A*B on entry; updated in place.
 * @param window Execution window to iterate over.
 * @param beta   Scalar weight applied to C.
 */
void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &window, float beta)
{
    // Broadcast the scalar weight once, outside the element loop.
    const float16x8_t beta_f16 = vdupq_n_f16(beta);

    Iterator in(input, window);
    Iterator out(output, window);

    // NOTE(review): the execute_window_loop wrapper and the two accumulate
    // statements were lost in extraction; reconstructed from the vaddq_f16/vmulq_f16
    // usage documented alongside this file — confirm against the original source.
    execute_window_loop(window, [&](const Coordinates &)
    {
        const auto in_ptr  = reinterpret_cast<const float16_t *>(in.ptr());
        const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());

        // Load 16 half-floats of the accumulator and of C per iteration.
        float16x8x2_t       alpha_ab = vld2q_f16(out_ptr);
        const float16x8x2_t c        = vld2q_f16(in_ptr);

        // alpha_ab += c * beta (no fused mla variant used here: multiply then add).
        alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
        alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));

        vst2q_f16(out_ptr + 0, alpha_ab);
    },
    in, out);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
// NOTE(review): fragments of NEGEMMMatrixAdditionKernel::configure() and ::run();
// the enclosing method bodies were lost in extraction.
// configure() fragment: select the data-type-specialised worker function.
128 _func = &matrix_addition_f32;
// F16 dispatch exists only when the toolchain provides FP16 vector arithmetic.
131 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 132 _func = &matrix_addition_f16;
// Hand the window setup to the simple-kernel base class.
141 INESimpleKernel::configure(input, output, num_elems_processed_per_iteration);
// run() fragment: dispatch through the stored function pointer.
161 (*_func)(_input, _output,
window, _beta);
const Window & window() const
The maximum window the kernel can be executed on.
float16x8_t vmulq_f16(float16x8_t, float16x8_t)
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
Store the tensor's metadata.
float16x8_t vaddq_f16(float16x8_t, float16x8_t)
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Interface for simple C++ kernels having 1 tensor input and 1 tensor output.
Interface for Neon tensor.
static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta)
Static function to check if given info will lead to a valid configuration of NEGEMMMatrixAdditionKernel.
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
NEGEMMMatrixAdditionKernel()
Constructor.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
void configure(const ITensor *input, ITensor *output, float beta)
Initialise the kernel's input and output.
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
unsigned int num_elems_processed_per_iteration
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)