const float       scale255_constant      = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);
inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
                                 float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    // ...

    if(dst->total_size() > 0)
    {
        // ...
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
            // ... (the explicitly allowed mixed-type combinations, e.g. U8 x U8 -> S16, follow here)
            ,
            "Invalid data type combination");
        // ...
    }

    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
                                        "Scale == 1/255 is not supported if input and dst are of data type S32");
    }
    else
    {
        int         exponent            = 0;
        const float normalized_mantissa = std::frexp(scale, &exponent);

        // Integer scaling is only used when the factor is exactly 1/2^n with 0 <= n <= 15:
        // frexp() returns a mantissa of 0.5, so the exponent must lie in [-14, 1].
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)),
                                        "Scale value not supported (Should be 1/(2^n) or 1/255)");
    }

    return Status{};
}
// Multiply by 1/255: convert to float, scale, then add 0.5 before the truncating
// float->int conversion so that non-negative values are rounded to nearest.
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
}
inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
}
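// The two overloads below requantize a float32x4x4_t back to 16 quantized values; the
// std::enable_if on the template type selects the signed (QASYMM8_SIGNED) or unsigned
// (QASYMM8) variant.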
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize_signed(val, info);
}
template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize(val, info);
}
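// Quantized (QASYMM8 / QASYMM8_SIGNED) multiplication: both inputs are dequantized to fp32,
// multiplied, and requantized with tmp_qua_info, whose scale is output_scale / scale, so the
// user-supplied multiplication scale is folded into the requantization step.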
template <typename T>
void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as it is handled manually below
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16 / sizeof(T);
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
    const UniformQuantizationInfo tmp_qua_info    = { output_qua_info.scale / scale, output_qua_info.offset };
    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear the X dimension on the non-broadcast window as it is handled manually below
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(dst.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     src1    = *(non_broadcast_input_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
        const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();

        // Clear the X dimension on the input windows as it is handled manually below
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto input1_q = wrapper::vloadq(input1_ptr + x);
                const auto input2_q = wrapper::vloadq(input2_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     src1    = *(input1_ptr + x);
                const T     src2    = *(input2_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        input1, input2, dst);
    }
}
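// QSYMM16 x QSYMM16 -> QSYMM16 with saturation: the same dequantize / fp32 multiply /
// requantize scheme as above, using vquantize_qsymm16 in the vector path and a scalar tail
// that clamps the result to [SHRT_MIN, SHRT_MAX].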
void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as it is handled manually below
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize dst and clamp to the QSYMM16 range
            int32_t         tmp     = lrintf(tmp_f / tmp_qua_info.scale);
            const qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x)       = tmp_qua;
        }
    },
    input1, input2, dst);
}
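// QSYMM16 x QSYMM16 -> S32: the 16-bit inputs are widened to 32 bit and multiplied directly;
// the raw integer products are stored without any rescaling (the scale parameter is not
// applied in either the vector or the scalar path).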
void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as it is handled manually below
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Widen the 16-bit inputs to 32 bit
            const int32x4x4_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x4_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp       = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, dst);
}
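// U8 x U8 -> U8. The products are computed in 16 bit; is_scale255 selects the 1/255 rescale
// via scale255_U16_U16, otherwise the result is shifted right by n (the scale exponent).
// is_sat selects saturating versus truncating narrowing back to 8 bit.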
template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as it is handled manually below
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = vld1q_u8(input1_ptr + x);
            const uint8x16_t ta2 = vld1q_u8(input2_ptr + x);

            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, dst);
}
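// For the S16 path below, the widened 32-bit products are divided by 2^n with an arithmetic
// right shift. A plain shift rounds toward -infinity, so for negative products the code adds
// (2^n - 1) first, which turns the shift into truncation toward zero (matching the scalar
// tail). Example with n = 2: -7 >> 2 == -2, whereas (-7 + 3) >> 2 == -1 == trunc(-7 / 4).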
template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
{
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(src1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(src1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(src2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate the conversion bit: (2^n - 1) for negative lanes, 0 for non-negative ones
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);

        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    if(is_sat)
    {
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}
template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
{
    const int16x8x2_t result =
    {
        {
            // First 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
            // Second 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
        }
    };

    return result;
}
template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as it is handled manually below
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round toward zero for negative values: add 2^n - 1 before shifting
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
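// The S32 path mirrors the S16 one but keeps 64-bit intermediates: each pair of lanes is
// multiplied with vmull_s32, the same (2^n - 1) correction is added to negative products,
// and the result is shifted right by n before narrowing back to 32 bit.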
template <bool is_sat>
inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
{
    const int32x2_t input1_1 = vget_low_s32(src1);
    const int32x2_t input2_1 = vget_low_s32(src2);
    const int32x2_t input1_2 = vget_high_s32(src1);
    const int32x2_t input2_2 = vget_high_s32(src2);

    int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
    int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);

    // Right shift amount
    const int64x2_t vn = vdupq_n_s64(-n);
    // Left shift amount
    const int64x2_t vnl = vdupq_n_s64(n);
    // Calculate the conversion bit: (2^n - 1) for negative lanes, 0 for non-negative ones
    const uint64x2_t tmp_1_u   = vreinterpretq_u64_s64(tmp_1);
    const uint64x2_t sign_1    = vshrq_n_u64(tmp_1_u, 63);
    const int64x2_t  sign_1_s  = vreinterpretq_s64_u64(sign_1);
    const int64x2_t  convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);

    const uint64x2_t tmp_2_u   = vreinterpretq_u64_s64(tmp_2);
    const uint64x2_t sign_2    = vshrq_n_u64(tmp_2_u, 63);
    const int64x2_t  sign_2_s  = vreinterpretq_s64_u64(sign_2);
    const int64x2_t  convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);

    if(is_sat)
    {
        tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
    }
    else
    {
        tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
    }
}
template <bool is_sat>
inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
{
    const int32x4x2_t result =
    {
        {
            // First 4 elements
            mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
            // Second 4 elements
            mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
        }
    };

    return result;
}
template <bool is_sat>
void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as it is handled manually below
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 8;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear the X dimension on the non-broadcast window as it is handled manually below
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<int32_t *>(dst.ptr());

            const int32_t broadcast_value     = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
            const auto    broadcast_value_vec = vdupq_n_s32(broadcast_value);

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int32x4x2_t broadcast_v =
                {
                    {
                        broadcast_value_vec,
                        broadcast_value_vec,
                    }
                };
                const int32x4x2_t non_broadcast_v =
                {
                    {
                        vld1q_s32(non_broadcast_input_ptr + x),
                        vld1q_s32(non_broadcast_input_ptr + x + 4),
                    }
                };
                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));

                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round toward zero for negative values: add 2^n - 1 before shifting
                    uint64_t mask = ((uint64_t)1u << n) - 1;
                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                }
                if(is_sat)
                {
                    tmp = utility::clamp<int64_t, int32_t>(tmp);
                }
                *(output_ptr + x) = static_cast<int32_t>(tmp);
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear the X dimension on the input windows as it is handled manually below
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int32x4x2_t ta1 =
                {
                    {
                        vld1q_s32(input1_ptr + x),
                        vld1q_s32(input1_ptr + x + 4),
                    }
                };
                const int32x4x2_t ta2 =
                {
                    {
                        vld1q_s32(input2_ptr + x),
                        vld1q_s32(input2_ptr + x + 4),
                    }
                };
                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));

                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round toward zero for negative values: add 2^n - 1 before shifting
                    uint64_t mask = ((uint64_t)1u << n) - 1;
                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                }
                if(is_sat)
                {
                    tmp = utility::clamp<int64_t, int32_t>(tmp);
                }
                *(output_ptr + x) = static_cast<int32_t>(tmp);
            }
        },
        input1, input2, dst);
    }
}
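// F32 x F32 -> F32: a plain vector multiply of the two inputs followed by a multiply with the
// broadcast scale factor, with a dedicated path when one input is broadcast along X.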
void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as it is handled manually below
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear the X dimension on the non-broadcast window as it is handled manually below
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());

            const float broadcast_value     = *reinterpret_cast<const float *>(broadcast_input.ptr());
            const auto  broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
            const auto  scale_vec           = wrapper::vdup_n(scale, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                const auto res             = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear the X dimension on the input windows as it is handled manually below
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto ta1       = wrapper::vloadq(input1_ptr + x);
                const auto ta2       = wrapper::vloadq(input2_ptr + x);
                const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
                const auto res       = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, dst);
    }
}
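// Complex (2-channel interleaved real/imaginary) F32 multiplication. Each pair of floats is
// one complex value, and the result is (re1*re2 - im1*im2, re1*im2 + im1*re2), as in the
// scalar tails below; the vector path forms the same products using the {-1, 1, -1, 1} mask
// together with lane reversal (vrev64) and a multiply-accumulate.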
void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as it is handled manually below
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 8 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear the X dimension on the non-broadcast window as it is handled manually below
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());

            const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                float32x4_t b = vdupq_n_f32(broadcast_value);

                const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
                // ... (vector complex product of the non-broadcast lanes with the broadcast
                // value, equivalent to the scalar computation below)
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
                const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
                auto       res1                 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
                auto       res2                 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
                *(output_ptr + 2 * x)           = res1;
                *(output_ptr + 2 * x + 1)       = res2;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear the X dimension on the input windows as it is handled manually below
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
                // ... (vector complex product of the interleaved real/imaginary lanes,
                // equivalent to the scalar computation below)
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto a0   = *(input1_ptr + 2 * x);
                const auto a1   = *(input1_ptr + 2 * x + 1);
                const auto b0   = *(input2_ptr + 2 * x);
                const auto b1   = *(input2_ptr + 2 * x + 1);
                auto       res1 = a0 * b0 - a1 * b1;
                auto       res2 = a0 * b1 + a1 * b0;
                *(output_ptr + 2 * x)     = res1;
                *(output_ptr + 2 * x + 1) = res2;
            }
        },
        input1, input2, dst);
    }
}
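// F16 x F16 -> F16, compiled only when the toolchain provides fp16 vector arithmetic
// (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC). The structure mirrors the F32 kernel, processing
// two float16x8_t registers per iteration.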
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as it is handled manually below
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16;
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear the X dimension on the non-broadcast window as it is handled manually below
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float16_t *>(dst.ptr());

            const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());

            const float16x8x2_t broadcast_value_vec =
            {
                {
                    vdupq_n_f16(broadcast_value),
                    vdupq_n_f16(broadcast_value),
                }
            };
            const auto scale_vec = vdupq_n_f16(scale);

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t non_broadcast_v =
                {
                    {
                        vld1q_f16(non_broadcast_input_ptr + x),
                        vld1q_f16(non_broadcast_input_ptr + x + 8),
                    }
                };
                const float16x8x2_t result =
                {
                    {
                        vmulq_f16(vmulq_f16(non_broadcast_v.val[0], broadcast_value_vec.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(non_broadcast_v.val[1], broadcast_value_vec.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear the X dimension on the input windows as it is handled manually below
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t ta1 =
                {
                    {
                        vld1q_f16(input1_ptr + x),
                        vld1q_f16(input1_ptr + x + 8),
                    }
                };
                const float16x8x2_t ta2 =
                {
                    {
                        vld1q_f16(input2_ptr + x),
                        vld1q_f16(input2_ptr + x + 8),
                    }
                };
                const float16x8_t   scale_vec = vdupq_n_f16(scale);
                const float16x8x2_t result    =
                {
                    {
                        vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, dst);
    }
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
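// U8 x U8 -> S16: the 16-bit products either get the 1/255 rescale or a right shift by n,
// and with saturation enabled they are additionally clamped to SHRT_MAX before being stored
// as signed 16-bit values (the products are never negative, so only the upper bound matters).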
template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as it is handled manually below
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t av = vld1q_u8(input1_ptr + x);
            const uint8x16_t bv = vld1q_u8(input2_ptr + x);

            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            if(is_scale255)
            {
                tmp_low  = scale255_U16_U16(tmp_low);
                tmp_high = scale255_U16_U16(tmp_high);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp_low  = vqshlq_u16(tmp_low, vn);
                    tmp_high = vqshlq_u16(tmp_high, vn);
                }
                else
                {
                    tmp_low  = vshlq_u16(tmp_low, vn);
                    tmp_high = vshlq_u16(tmp_high, vn);
                }
            }

            if(is_sat)
            {
                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);

                tmp_low  = vminq_u16(tmp_low, max);
                tmp_high = vminq_u16(tmp_high, max);
            }

            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }

            // The product of two unsigned 8-bit values cannot be negative, so only the upper bound is clamped
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
            }

            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
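// S16 x U8 -> S16: the U8 operand is widened to S16 and the computation is forwarded to the
// S16 x S16 helpers; mul_U8_S16_S16 below simply swaps the operands and reuses this kernel.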
template <bool is_scale255, bool is_sat>
void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as it is handled manually below
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const uint8x8x2_t ta2u =
            {
                {
                    vld1_u8(input2_ptr + x),
                    vld1_u8(input2_ptr + x + 8),
                }
            };
            // Widen the U8 operand to S16
            const int16x8x2_t ta2 =
            {
                {
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
                }
            };

            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Round toward zero for negative values: add 2^n - 1 before shifting
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
template <bool is_scale255, bool is_sat>
void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Simply swap the two input buffers
    mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
}
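// CpuMulKernel::configure(): after validation the kernel records the scale, derives the
// integer scale exponent when scale == 1/2^n, and picks one of the function pointers
// (_func_quantized / _func_int / _func_float) according to the input/output data types and
// the overflow policy (is_sat).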
    _scale          = scale;
    _scale_exponent = 0;
    _func_quantized = nullptr;
    _func_int       = nullptr;
    _func_float     = nullptr;

    bool is_scale_255 = false;
    // Check and validate the scaling factor
    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        is_scale_255 = true;
    }
    else
    {
        int exponent = 0;
        std::frexp(scale, &exponent);

        // Store the positive exponent. We know that we compute 1/2^n.
        // Additionally we need to subtract 1 to compensate that frexp uses a mantissa of 0.5.
        _scale_exponent = std::abs(exponent - 1);
    }

    const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);

    // Function dispatch on the src1/src2/dst data-type combination; each assignment below
    // lives in its own data-type branch:
    _func_quantized = &mul_saturate_quantized_8<uint8_t>;                                      // QASYMM8 x QASYMM8 -> QASYMM8
    _func_quantized = &mul_saturate_quantized_8<int8_t>;                                       // QASYMM8_SIGNED x QASYMM8_SIGNED -> QASYMM8_SIGNED
    _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;                                   // QSYMM16 x QSYMM16 -> QSYMM16
    _func_int       = &mul_QSYMM16_QSYMM16_S32;                                                // QSYMM16 x QSYMM16 -> S32
    _func_int       = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;     // S16 x U8 -> S16, scale == 1/255
    _func_int       = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;   // S16 x U8 -> S16, scale == 1/2^n
    _func_int       = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;   // S16 x S16 -> S16, scale == 1/255
    _func_int       = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>; // S16 x S16 -> S16, scale == 1/2^n
    _func_int       = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;               // S32 x S32 -> S32
    _func_int       = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;         // U8 x U8 -> U8, scale == 1/255
    _func_int       = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;       // U8 x U8 -> U8, scale == 1/2^n
    _func_int       = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;       // U8 x U8 -> S16, scale == 1/255
    _func_int       = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;     // U8 x U8 -> S16, scale == 1/2^n
    _func_int       = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;     // U8 x S16 -> S16, scale == 1/255
    _func_int       = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;   // U8 x S16 -> S16, scale == 1/2^n
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    _func_float = &mul_F16_F16_F16;                                                            // F16 x F16 -> F16
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
    _func_float = &mul_F32_F32_F32;                                                            // F32 x F32 -> F32

    // Configure kernel window
    // ...
    ICpuKernel::configure(win);
    if(_func_quantized != nullptr)
    {
        (*_func_quantized)(src1, src2, dst, window, _scale);
    }
    else if(_func_int != nullptr)
    {
        (*_func_int)(src1, src2, dst, window, _scale_exponent);
    }
    else
    {
        ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
        (*_func_float)(src1, src2, dst, window, _scale);
    }
    return "CpuMulKernel";                          // CpuMulKernel::name()

    ICpuKernel::configure(win);                     // end of CpuComplexMulKernel::configure()

    c_mul_F32_F32_F32_n(src1, src2, dst, window);   // CpuComplexMulKernel::run_op()

    return "CpuComplexMulKernel";                   // CpuComplexMulKernel::name()
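// A minimal usage sketch (not part of this file; the tensor-pack slot names ACL_SRC_0 /
// ACL_SRC_1 / ACL_DST and the surrounding setup are assumptions based on other CPU kernels):
//
//   cpu::kernels::CpuMulKernel mul;
//   ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuMulKernel::validate(
//       &src0_info, &src1_info, &dst_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
//   mul.configure(&src0_info, &src1_info, &dst_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
//
//   ITensorPack pack;
//   pack.add_tensor(TensorType::ACL_SRC_0, &src0);
//   pack.add_tensor(TensorType::ACL_SRC_1, &src1);
//   pack.add_tensor(TensorType::ACL_DST, &dst);
//   mul.run_op(pack, mul.window(), ThreadInfo{});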