#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_fp16.h> // needed for float16_t
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

namespace
{
const float       scale255_constant      = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);
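// Note: NEON has no integer divide, so the 1/255 scaling used throughout this file is done
// by converting to float, multiplying by scale255_constant and adding positive_round_f32q
// (0.5) before converting back, i.e. a multiply by 1/255 with round-half-up.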
inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    /* ... null-pointer and per-input data type checks elided in this listing ... */
    if(output->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(input1->data_type() == input2->data_type() && input2->data_type() == output->data_type()) &&
            /* ... the explicitly allowed mixed combinations (U8 x U8 -> S16, U8 x S16 -> S16,
                   S16 x U8 -> S16, QSYMM16 x QSYMM16 -> S32) elided in this listing ... */,
            "Invalid data type combination");
    }

    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->data_type() == DataType::S32 || input2->data_type() == DataType::S32 || output->data_type() == DataType::S32,
                                        "Scale == 1/255 is not supported if input and output are of data type S32");
    }
    else
    {
        int         exponent            = 0;
        const float normalized_mantissa = std::frexp(scale, &exponent);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)),
                                        "Scale value not supported (Should be 1/(2^n) or 1/255)");
    }

    return Status{};
}
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
}
inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
}
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize(val, info);
}
template <typename T>
void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    const int  window_step_x         = 16 / sizeof(T);
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();

    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
    const UniformQuantizationInfo tmp_qua_info    = { output_qua_info.scale / scale, output_qua_info.offset };
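    // Note: tmp_qua_info divides the output scale by the kernel scale, which folds the
    // elementwise scale factor into requantization: round(v / (out_scale / scale)) equals
    // round(v * scale / out_scale), so the inner loops below need no extra multiply.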
    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
        const UniformQuantizationInfo broadcast_qinfo     = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize output
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     in1     = *(non_broadcast_input_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, non_broadcast_qinfo);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize output
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
        const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto input1_q = wrapper::vloadq(input1_ptr + x);
                const auto input2_q = wrapper::vloadq(input2_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize output
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     in1     = *(input1_ptr + x);
                const T     in2     = *(input2_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(in1, input1_qua_info);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(in2, input2_qua_info);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize output
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        input1, input2, output);
    }
}
void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize output, lrintf() rounds to nearest
            int32_t   tmp     = lrintf(tmp_f / tmp_qua_info.scale);
            qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
        }
    },
    input1, input2, output);
}
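// QSYMM16 x QSYMM16 -> S32 only widens the products and never saturates:
// |a|, |b| <= 32767, so |a * b| <= 32767^2 < 2^31 always fits in int32_t, which is why
// the variant below needs no saturation or rounding logic.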
void mul_QSYMM16_QSYMM16_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Widen the 16-bit lanes to 32 bit before multiplying
            const int32x4x4_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x4_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp       = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, output);
}
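// The integer kernels below are parameterised on two compile-time flags:
//   is_scale255 - scale the widened product by 1/255 (round half up) instead of shifting right by n
//   is_sat      - saturate on shift/narrowing (vqshlq/vqmovn) instead of wrapping (vshlq/vmovn)
// so each of the four policy combinations compiles to a branch-free inner loop.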
template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = vld1q_u8(input1_ptr + x);
            const uint8x16_t ta2 = vld1q_u8(input2_ptr + x);

            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, output);
}
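// Rounding trick used by the signed-shift helpers below: an arithmetic right shift by n rounds
// towards -inf, but this kernel rounds towards zero. Adding (2^n - 1) to negative values first
// corrects that, and the correction is built branch-free as convert = sign * (2^n - 1), where
// sign (0 or 1) is the logically shifted-down sign bit. E.g. n = 2: (-7 + 3) >> 2 == -1, the
// truncation of -7/4, whereas -7 >> 2 == -2.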
template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
{
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(input1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(input2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(input1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(input2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate conversion bit
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);

        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    if(is_sat)
    {
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}
template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x2_t &input2, int n)
{
    const int16x8x2_t result =
    {
        {
            // First 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[0], input2.val[0], n),
            // Second 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[1], input2.val[1], n)
        }
    };

    return result;
}
template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round towards zero: add (2^n - 1) to negative values before the shift
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}
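// The S32 kernel repeats the same pattern one level wider: 32-bit lanes are multiplied into
// 64-bit lanes with vmull_s32, the sign-correction term is computed on 64-bit lanes, and the
// result is narrowed back to 32 bits, saturating (vqmovn_s64) or wrapping (vmovn_s64).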
template <bool is_sat>
inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &input1, const int32x4_t &input2, int n)
{
    const int32x2_t input1_1 = vget_low_s32(input1);
    const int32x2_t input2_1 = vget_low_s32(input2);
    const int32x2_t input1_2 = vget_high_s32(input1);
    const int32x2_t input2_2 = vget_high_s32(input2);

    int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
    int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);

    // Right shift amount
    const int64x2_t vn = vdupq_n_s64(-n);
    // Left shift amount
    const int64x2_t vnl = vdupq_n_s64(n);
    // Calculate conversion bit
    const uint64x2_t tmp_1_u   = vreinterpretq_u64_s64(tmp_1);
    const uint64x2_t sign_1    = vshrq_n_u64(tmp_1_u, 63);
    const int64x2_t  sign_1_s  = vreinterpretq_s64_u64(sign_1);
    const int64x2_t  convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);

    const uint64x2_t tmp_2_u   = vreinterpretq_u64_s64(tmp_2);
    const uint64x2_t sign_2    = vshrq_n_u64(tmp_2_u, 63);
    const int64x2_t  sign_2_s  = vreinterpretq_s64_u64(sign_2);
    const int64x2_t  convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);

    if(is_sat)
    {
        tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
    }
    else
    {
        tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
    }
}
template <bool is_sat>
inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &input1, const int32x4x2_t &input2, int n)
{
    const int32x4x2_t result =
    {
        {
            // First 4 elements
            mul_S32_S32_S32_n_loop<is_sat>(input1.val[0], input2.val[0], n),
            // Second 4 elements
            mul_S32_S32_S32_n_loop<is_sat>(input1.val[1], input2.val[1], n)
        }
    };

    return result;
}
template <bool is_sat>
void mul_S32_S32_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 8;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<int32_t *>(output.ptr());

            const int32_t broadcast_value     = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
            const auto    broadcast_value_vec = vdupq_n_s32(broadcast_value);

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int32x4x2_t broadcast_v =
                {
                    {
                        broadcast_value_vec,
                        broadcast_value_vec,
                    }
                };
                const int32x4x2_t non_broadcast_v =
                {
                    {
                        vld1q_s32(non_broadcast_input_ptr + x),
                        vld1q_s32(non_broadcast_input_ptr + x + 4),
                    }
                };
                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));

                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round towards zero: add (2^n - 1) to negative values before the shift
                    uint64_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                }
                if(is_sat)
                {
                    tmp = utility::clamp<int64_t, int32_t>(tmp);
                }
                *(output_ptr + x) = static_cast<int32_t>(tmp);
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int32x4x2_t ta1 =
                {
                    {
                        vld1q_s32(input1_ptr + x),
                        vld1q_s32(input1_ptr + x + 4),
                    }
                };
                const int32x4x2_t ta2 =
                {
                    {
                        vld1q_s32(input2_ptr + x),
                        vld1q_s32(input2_ptr + x + 4),
                    }
                };
                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));

                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round towards zero: add (2^n - 1) to negative values before the shift
                    uint64_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                }
                if(is_sat)
                {
                    tmp = utility::clamp<int64_t, int32_t>(tmp);
                }
                *(output_ptr + x) = static_cast<int32_t>(tmp);
            }
        },
        input1, input2, output);
    }
}
void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(output.ptr());

            const float broadcast_value     = *reinterpret_cast<const float *>(broadcast_input.ptr());
            const auto  broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
            const auto  scale_vec           = wrapper::vdup_n(scale, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                const auto res             = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto ta1       = wrapper::vloadq(input1_ptr + x);
                const auto ta2       = wrapper::vloadq(input2_ptr + x);
                const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
                const auto res       = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, output);
    }
}
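// c_mul_F32_F32_F32_n multiplies complex numbers stored as interleaved (real, imaginary)
// float pairs, so complex element x lives at offsets 2*x and 2*x + 1. The scalar tail spells
// out the product (a0 + i*a1)(b0 + i*b1) = (a0*b0 - a1*b1) + i*(a0*b1 + a1*b0); the vector
// path implements the same formula with a { -1, 1, -1, 1 } sign mask on shuffled lanes.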
void c_mul_F32_F32_F32_n(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 8 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(output.ptr());

            const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                float32x4_t b = vdupq_n_f32(broadcast_value);

                const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
                /* ... lane shuffles (wrapper::vgetlane/vdup_n with ExactTagType), the sign
                       mask multiply and the fused multiply-add (wrapper::vmla) of the
                       vectorized complex product are elided in this listing ... */
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
                const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
                auto       res1                 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
                auto       res2                 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
                *(output_ptr + 2 * x)           = res1;
                *(output_ptr + 2 * x + 1)       = res2;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
                /* ... loads of the two interleaved complex operands, lane shuffles and the
                       masked multiply-add of the vectorized complex product are elided in
                       this listing ... */
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto a0 = *(input1_ptr + 2 * x);
                const auto a1 = *(input1_ptr + 2 * x + 1);
                const auto b0 = *(input2_ptr + 2 * x);
                const auto b1 = *(input2_ptr + 2 * x + 1);

                auto res1 = a0 * b0 - a1 * b1;
                auto res2 = a0 * b1 + a1 * b0;

                *(output_ptr + 2 * x)     = res1;
                *(output_ptr + 2 * x + 1) = res2;
            }
        },
        input1, input2, output);
    }
}
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void mul_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16;
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float16_t *>(output.ptr());
            const auto broadcast_value         = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());

            const float16x8x2_t broadcast_value_vec =
            {
                {
                    vdupq_n_f16(broadcast_value),
                    vdupq_n_f16(broadcast_value),
                }
            };
            const auto scale_vec = vdupq_n_f16(scale);

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t non_broadcast_v =
                {
                    {
                        vld1q_f16(non_broadcast_input_ptr + x),
                        vld1q_f16(non_broadcast_input_ptr + x + 8),
                    }
                };
                const float16x8x2_t result =
                {
                    {
                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t ta1 =
                {
                    {
                        vld1q_f16(input1_ptr + x),
                        vld1q_f16(input1_ptr + x + 8),
                    }
                };
                const float16x8x2_t ta2 =
                {
                    {
                        vld1q_f16(input2_ptr + x),
                        vld1q_f16(input2_ptr + x + 8),
                    }
                };
                const float16x8_t   scale_vec = vdupq_n_f16(scale);
                const float16x8x2_t result =
                {
                    {
                        vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, output);
    }
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t av = vld1q_u8(input1_ptr + x);
            const uint8x16_t bv = vld1q_u8(input2_ptr + x);

            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low  = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            if(is_scale255)
            {
                tmp_low  = scale255_U16_U16(tmp_low);
                tmp_high = scale255_U16_U16(tmp_high);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp_low  = vqshlq_u16(tmp_low, vn);
                    tmp_high = vqshlq_u16(tmp_high, vn);
                }
                else
                {
                    tmp_low  = vshlq_u16(tmp_low, vn);
                    tmp_high = vshlq_u16(tmp_high, vn);
                }
            }

            if(is_sat)
            {
                // The product is unsigned, so only the upper bound needs clamping before
                // the lanes are reinterpreted as signed S16
                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);

                tmp_low  = vminq_u16(tmp_low, max);
                tmp_high = vminq_u16(tmp_high, max);
            }

            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }

            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
            }

            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}
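// mul_S16_U8_S16 widens the U8 operand to S16 with vmovl_u8 and then reuses the S16 x S16
// helpers above; U8 values fit in S16 without loss, so no separate unsigned path is needed.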
template <bool is_scale255, bool is_sat>
void mul_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const uint8x8x2_t ta2u =
            {
                {
                    vld1_u8(input2_ptr + x),
                    vld1_u8(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
                }
            };

            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round towards zero: add (2^n - 1) to negative values before the shift
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}
template <bool is_scale255, bool is_sat>
void mul_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Simply swap the two input buffers: multiplication is commutative
    mul_S16_U8_S16<is_scale255, is_sat>(in2, in1, out, window, n);
}
} // namespace
NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
    : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
{
}

void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    /* ... argument validation and broadcast_shape_and_valid_region() call elided in this listing ... */
    const TensorShape &out_shape = broadcast_pair.first;
    /* ... auto-initialisation of the output tensor info elided ... */

    _scale          = scale;
    _scale_exponent = 0;
    _func_quantized = nullptr;
    _func_int       = nullptr;
    _func_float     = nullptr;

    bool is_scale_255 = false;
    // Check and validate scaling factor
    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        is_scale_255 = true;
    }
    else
    {
        int exponent = 0;
        std::frexp(scale, &exponent);
        // For scale = 1/2^n, frexp() returns 0.5 * 2^(1 - n), so the shift amount is |exponent - 1|
        _scale_exponent = std::abs(exponent - 1);
    }

    // Function-pointer dispatch on the data type combination (the surrounding if/else chain
    // is elided in this listing; each branch performs exactly one of these assignments, with
    // is_sat and is_scale_255 selecting the template instantiation):
    //   QASYMM8 x QASYMM8               _func_quantized = &mul_saturate_quantized_8<uint8_t>;
    //   QASYMM8_SIGNED x same           _func_quantized = &mul_saturate_quantized_8<int8_t>;
    //   QSYMM16 x QSYMM16 -> QSYMM16    _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
    //   QSYMM16 x QSYMM16 -> S32        _func_int = &mul_QSYMM16_QSYMM16_S32;
    //   S16 x U8 -> S16    (1/255)      _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
    //                      (1/2^n)      _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
    //   S16 x S16 -> S16   (1/255)      _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
    //                      (1/2^n)      _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
    //   S32 x S32 -> S32                _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
    //   U8 x U8 -> U8      (1/255)      _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
    //                      (1/2^n)      _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
    //   U8 x U8 -> S16     (1/255)      _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
    //                      (1/2^n)      _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
    //   U8 x S16 -> S16    (1/255)      _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
    //                      (1/2^n)      _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
    //   F16 x F16 -> F16                _func_float = &mul_F16_F16_F16;  (only with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    //   F32 x F32 -> F32                _func_float = &mul_F32_F32_F32;

    // Configure kernel window
    /* ... window calculation elided ... */
    INEKernel::configure(win);
}
void NEPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    /* ... window validation and tensor unpacking elided in this listing ... */
    if(_func_quantized != nullptr)
    {
        (*_func_quantized)(input1, input2, output, window, _scale);
    }
    else if(_func_int != nullptr)
    {
        (*_func_int)(input1, input2, output, window, _scale_exponent);
    }
    else
    {
        ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
        (*_func_float)(input1, input2, output, window, _scale);
    }
}
// NEComplexPixelWiseMultiplicationKernel follows the same pattern (bodies mostly elided in
// this listing): configure() broadcasts the input shapes and sets up the window,
//     const TensorShape &out_shape = broadcast_pair.first;
//     ...
//     INEKernel::configure(win);
// and run_op() dispatches straight to the complex multiply kernel:
//     c_mul_F32_F32_F32_n(input1, input2, output, window);