// Copyright (c) 2017-2021 Arm Limited.

/* Maximum value representable in S16, used for saturation in acc_sq_v16_u8. */
const static uint16x8_t max_int_u16 = vdupq_n_u16(static_cast<uint16_t>(INT16_MAX));
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
namespace fp16
{
// Widen 16 U8 pixels into two float16x8_t halves.
inline float16x8x2_t convert_u8x16_to_f16x8x2(uint8x16_t input)
{
    const float16x8x2_t out = { { vcvtq_f16_u16(vmovl_u8(vget_low_u8(input))),
                                  vcvtq_f16_u16(vmovl_u8(vget_high_u8(input))) } };
    return out;
}
// Narrow two float16x8_t halves back to 16 U8 pixels.
inline uint8x16_t convert_f16x8x2_to_u8x16(const float16x8x2_t &input)
{
    return vcombine_u8(vmovn_u16(vcvtq_u16_f16(input.val[0])),
                       vmovn_u16(vcvtq_u16_f16(input.val[1])));
}
// res = vec1 * scale_val + vec0 * scale_val2, i.e. accum = (1 - alpha) * accum + alpha * input.
inline float16x8x2_t vector_accumulate_weighted(const float16x8x2_t &vec0, const float16x8x2_t &vec1,
                                                float16x8_t scale_val, float16x8_t scale_val2)
{
    const float16x8x2_t res = { { vfmaq_f16(vmulq_f16(vec1.val[0], scale_val), vec0.val[0], scale_val2),
                                  vfmaq_f16(vmulq_f16(vec1.val[1], scale_val), vec0.val[1], scale_val2) } };
    return res;
}
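// Reference only, not part of the original kernel: a scalar sketch of the same
// per-pixel update, assuming scale_val = 1 - alpha and scale_val2 = alpha as
// they are set up in the run() method further below.
static inline uint8_t accumulate_weighted_scalar(uint8_t input, uint8_t accum, float alpha)
{
    // accum = (1 - alpha) * accum + alpha * input, truncated back to U8.
    return static_cast<uint8_t>((1.f - alpha) * accum + alpha * input);
}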
// Weighted accumulation of 64 U8 pixels per call (vld4q_u8 loads four deinterleaved 16-byte registers).
void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, float16x8_t scale_val, float16x8_t scale_val2)
{
    ARM_COMPUTE_ERROR_ON(nullptr == input);
    ARM_COMPUTE_ERROR_ON(nullptr == accum);

    const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
    const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);
    const uint8x16x4_t input_buffer = vld4q_u8(input_ptr);
    uint8x16x4_t       accum_buffer = vld4q_u8(accum_ptr);
    const float16x8x2_t f16_input_0 = convert_u8x16_to_f16x8x2(input_buffer.val[0]);
    const float16x8x2_t f16_input_1 = convert_u8x16_to_f16x8x2(input_buffer.val[1]);
    const float16x8x2_t f16_input_2 = convert_u8x16_to_f16x8x2(input_buffer.val[2]);
    const float16x8x2_t f16_input_3 = convert_u8x16_to_f16x8x2(input_buffer.val[3]);

    float16x8x2_t f16_accum_0 = convert_u8x16_to_f16x8x2(accum_buffer.val[0]);
    float16x8x2_t f16_accum_1 = convert_u8x16_to_f16x8x2(accum_buffer.val[1]);
    float16x8x2_t f16_accum_2 = convert_u8x16_to_f16x8x2(accum_buffer.val[2]);
    float16x8x2_t f16_accum_3 = convert_u8x16_to_f16x8x2(accum_buffer.val[3]);

    f16_accum_0 = vector_accumulate_weighted(f16_input_0, f16_accum_0, scale_val, scale_val2);
    f16_accum_1 = vector_accumulate_weighted(f16_input_1, f16_accum_1, scale_val, scale_val2);
    f16_accum_2 = vector_accumulate_weighted(f16_input_2, f16_accum_2, scale_val, scale_val2);
    f16_accum_3 = vector_accumulate_weighted(f16_input_3, f16_accum_3, scale_val, scale_val2);
    accum_buffer = { { convert_f16x8x2_to_u8x16(f16_accum_0),
                       convert_f16x8x2_to_u8x16(f16_accum_1),
                       convert_f16x8x2_to_u8x16(f16_accum_2),
                       convert_f16x8x2_to_u8x16(f16_accum_3) } };

    vst4q_u8(accum_ptr, accum_buffer);
}
} // namespace fp16
void NEAccumulateWeightedFP16Kernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);

    Iterator input(_input, window);
    Iterator accum(_output, window);

    const float16x8_t scale_val  = vdupq_n_f16(1.f - _alpha);
    const float16x8_t scale_val2 = vdupq_n_f16(_alpha);

    execute_window_loop(window, [&](const Coordinates &) { fp16::acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2); },
                        input, accum);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
inline void acc_v16_u8(const void *__restrict input, void *__restrict accum)
{
    ARM_COMPUTE_ERROR_ON(nullptr == input);
    ARM_COMPUTE_ERROR_ON(nullptr == accum);

    const auto in  = static_cast<const uint8_t *__restrict>(input);
    const auto out = static_cast<int16_t *__restrict>(accum);
    uint8x16_t ta1 = vld1q_u8(in);
    int16x8_t  ta2 = vld1q_s16(out);
    int16x8_t  ta3 = vld1q_s16(out + 8);
    // Widen each U8 half to S16 and add with saturation.
    ta2 = vqaddq_s16(ta2, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ta1))));
    ta3 = vqaddq_s16(ta3, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(ta1))));
    vst1q_s16(out, ta2);
    vst1q_s16(out + 8, ta3);
}
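// Reference only, not part of the original kernel: the scalar equivalent of one
// lane of acc_v16_u8. vqaddq_s16 saturates; since the U8 input is non-negative,
// only the INT16_MAX bound can be hit.
static inline int16_t accumulate_scalar(uint8_t input, int16_t accum)
{
    const int32_t sum = static_cast<int32_t>(accum) + static_cast<int32_t>(input);
    return static_cast<int16_t>(sum > INT16_MAX ? INT16_MAX : sum);
}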
// Widen 16 U8 pixels into four float32x4_t vectors.
inline float32x4x4_t convert_u8x16_to_f32x4x4(uint8x16_t input)
{
    const uint16x8_t u16_output_low = vmovl_u8(vget_low_u8(input));
    const uint16x8_t u16_output_hi  = vmovl_u8(vget_high_u8(input));

    const float32x4x4_t res = { { vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_low))),
                                  vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_low))),
                                  vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_hi))),
                                  vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_hi))) } };
    return res;
}
// Narrow four float32x4_t vectors back to 16 U8 pixels.
inline uint8x16_t convert_f32x4x4_to_u8x16(const float32x4x4_t &input)
{
    return vcombine_u8(vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[0])),
                                              vmovn_u32(vcvtq_u32_f32(input.val[1])))),
                       vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[2])),
                                              vmovn_u32(vcvtq_u32_f32(input.val[3])))));
}
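// Reference only: the per-lane scalar equivalent of the F32 -> U8 path above.
// vcvtq_u32_f32 truncates toward zero and the vmovn narrowing steps drop the
// high bits without saturating, so results are only meaningful for values that
// already fit in [0, 255].
static inline uint8_t f32_to_u8_scalar(float value)
{
    return static_cast<uint8_t>(static_cast<uint32_t>(value) & 0xFF);
}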
// vector_output = vector_output * scale_val + vector_input * scale_val2.
inline float32x4x4_t vector_accumulate_weighted(const float32x4x4_t &vector_input, float32x4x4_t vector_output,
                                                float32x4_t scale_val, float32x4_t scale_val2)
{
    vector_output.val[0] = vmulq_f32(vector_output.val[0], scale_val);
    vector_output.val[1] = vmulq_f32(vector_output.val[1], scale_val);
    vector_output.val[2] = vmulq_f32(vector_output.val[2], scale_val);
    vector_output.val[3] = vmulq_f32(vector_output.val[3], scale_val);

    vector_output.val[0] = vmlaq_f32(vector_output.val[0], vector_input.val[0], scale_val2);
    vector_output.val[1] = vmlaq_f32(vector_output.val[1], vector_input.val[1], scale_val2);
    vector_output.val[2] = vmlaq_f32(vector_output.val[2], vector_input.val[2], scale_val2);
    vector_output.val[3] = vmlaq_f32(vector_output.val[3], vector_input.val[3], scale_val2);

    return vector_output;
}
// F32 fallback: weighted accumulation of 16 U8 pixels per call.
inline void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, const float32x4_t scale_val, const float32x4_t scale_val2)
{
    ARM_COMPUTE_ERROR_ON(nullptr == input);
    ARM_COMPUTE_ERROR_ON(nullptr == accum);

    const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
    const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);
    const uint8x16_t input_buffer = vld1q_u8(input_ptr);
    const uint8x16_t accum_buffer = vld1q_u8(accum_ptr);

    const float32x4x4_t f32_input_0  = convert_u8x16_to_f32x4x4(input_buffer);
    const float32x4x4_t f32_output_0 = convert_u8x16_to_f32x4x4(accum_buffer);

    const float32x4x4_t f32_res_0 = vector_accumulate_weighted(f32_input_0, f32_output_0, scale_val, scale_val2);

    vst1q_u8(accum_ptr, convert_f32x4x4_to_u8x16(f32_res_0));
}
// Squared accumulation: accum(S16) += (input(U8)^2) >> shift, saturating at INT16_MAX.
void acc_sq_v16_u8(const void *__restrict input, uint32_t shift, void *__restrict accum)
{
    ARM_COMPUTE_ERROR_ON(nullptr == input);
    ARM_COMPUTE_ERROR_ON(nullptr == accum);
    ARM_COMPUTE_ERROR_ON(shift > 15);

    const auto input_buffer = static_cast<const uint8_t *__restrict>(input);
    const auto accum_buffer = static_cast<int16_t *__restrict>(accum);
    const uint8x16_t ta1 = vld1q_u8(input_buffer);
    uint16x8_t       ta2 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer));
    uint16x8_t       ta3 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer + 8));

    // A negative shift count makes vqshlq_u16 shift right by `shift`.
    const int16x8_t vector_shift = vdupq_n_s16(-static_cast<int16_t>(shift));

    uint16x8_t linput = vmovl_u8(vget_low_u8(ta1));
    uint16x8_t hinput = vmovl_u8(vget_high_u8(ta1));

    linput = vmulq_u16(linput, linput);
    hinput = vmulq_u16(hinput, hinput);

    linput = vqshlq_u16(linput, vector_shift);
    hinput = vqshlq_u16(hinput, vector_shift);

    ta2 = vqaddq_u16(ta2, linput);
    ta3 = vqaddq_u16(ta3, hinput);
    // Clamp to INT16_MAX so the result remains a valid signed S16.
    vst1q_s16(accum_buffer, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta2)));
    vst1q_s16(accum_buffer + 8, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta3)));
}
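// Reference only, not part of the original kernel: the scalar equivalent of one
// lane of acc_sq_v16_u8, matching the widen/square/shift/saturate/clamp sequence.
static inline int16_t accumulate_squared_scalar(uint8_t input, int16_t accum, uint32_t shift)
{
    const uint16_t squared = static_cast<uint16_t>((input * input) >> shift); // 255^2 fits in U16
    const uint32_t sum     = static_cast<uint16_t>(accum) + squared;          // accum reinterpreted as U16, as in the vector code
    const uint32_t sat     = sum > UINT16_MAX ? UINT16_MAX : sum;             // vqaddq_u16 saturation
    return static_cast<int16_t>(sat > INT16_MAX ? INT16_MAX : sat);           // final clamp against max_int_u16
}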
void NEAccumulateKernel::configure(const ITensor *input, ITensor *accum)
{
    constexpr unsigned int num_elems_processed_per_iteration = 16;
    INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}
void NEAccumulateKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    Iterator input(_input, window);
    Iterator accum(_output, window);
    execute_window_loop(window, [&](const Coordinates &) { acc_v16_u8(input.ptr(), accum.ptr()); }, input, accum);
}
void NEAccumulateWeightedKernel::configure(const ITensor *input, float alpha, ITensor *accum)
{
    _alpha = alpha;
    constexpr unsigned int num_elems_processed_per_iteration = 16;
    INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}
void NEAccumulateWeightedKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    Iterator input(_input, window);
    Iterator accum(_output, window);

    const float32x4_t scale_val  = vdupq_n_f32(1.f - _alpha);
    const float32x4_t scale_val2 = vdupq_n_f32(_alpha);

    execute_window_loop(window, [&](const Coordinates &) { acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2); }, input, accum);
}
void NEAccumulateSquaredKernel::configure(const ITensor *input, uint32_t shift, ITensor *accum)
{
    _shift = shift;
    constexpr unsigned int num_elems_processed_per_iteration = 16;
    INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
}
void NEAccumulateSquaredKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    Iterator input(_input, window);
    Iterator accum(_output, window);
    execute_window_loop(window, [&](const Coordinates &) { acc_sq_v16_u8(input.ptr(), _shift, accum.ptr()); }, input, accum);
}
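// Minimal usage sketch (assumption: the standard arm_compute runtime wrappers,
// e.g. NEAccumulateWeighted from arm_compute/runtime/NEON/NEFunctions.h, which
// dispatch to the kernels in this file; shapes and formats are illustrative):
//
//   Tensor src, acc;
//   src.allocator()->init(TensorInfo(640U, 480U, Format::U8));
//   acc.allocator()->init(TensorInfo(640U, 480U, Format::U8));
//
//   NEAccumulateWeighted accumulate_weighted;
//   accumulate_weighted.configure(&src, 0.5f /* alpha in [0, 1] */, &acc);
//
//   src.allocator()->allocate();
//   acc.allocator()->allocate();
//   // ... fill src ...
//   accumulate_weighted.run();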