#if defined(ENABLE_FP32_KERNELS)
static constexpr size_t default_mws_N1_fp32_neon = 22447;
static constexpr size_t default_mws_V1_fp32_neon = 38982;
#endif /* ENABLE_FP32_KERNELS */
static constexpr size_t default_mws_other_platforms_1d_tensor = 10240;

const float       scale255_constant      = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);
const ITensorInfo *src2,
const ITensorInfo *dst,
"ConvertPolicy cannot be WRAP if datatype is quantized");
if (dst->total_size() > 0)
"Wrong shape for dst");
!(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
,
"Invalid data type combination");
"Unsupported scale for QSYMM16 inputs and S32 dst");
if (std::abs(scale - scale255_constant) < 0.00001f)
"Scale == 1/255 is not supported if input and dst are of data type S32");
const float normalized_mantissa = std::frexp(scale, &exponent);
127 "Scale value not supported (Should be 1/(2^n) or 1/255");
inline int32x4_t scale255_S32_S32(int32x4_t in)
    const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));

inline uint16x8_t scale255_U16_U16(uint16x8_t in)
    const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
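// Generic QASYMM8/QASYMM8_SIGNED path: both inputs are dequantized to float, multiplied
// lane by lane, then requantized. The element-wise scale is folded into tmp_qua_info
// (output scale divided by scale), so requantization applies scale and offset in one step.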
template <typename T>
void mul_saturate_quantized_8(
    const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    const int  window_step_x         = 16 / sizeof(T);
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
    const UniformQuantizationInfo tmp_qua_info    = {output_qua_info.scale / scale, output_qua_info.offset};

    if (is_broadcast_across_x)
        const bool is_broadcast_input_2 = input2_win.x().step() == 0;
        Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
        Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();

        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        [&](const Coordinates &)
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(dst.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            int x = window_start_x;
            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                const float32x4x4_t out_f32x4x4 = {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),

                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);

            for (; x < window_end_x; ++x)
                const T src1 = *(non_broadcast_input_ptr + x);
                const float tmp_f = tmp_in1 * tmp_in2;
                *(output_ptr + x) = tmp_qua;
        broadcast_input, non_broadcast_input, dst);
        const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
        const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();

        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        [&](const Coordinates &)
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(dst.ptr());

            int x = window_start_x;
            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

                const float32x4x4_t out_f32x4x4 = {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),

                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);

            for (; x < window_end_x; ++x)
                const T src1 = *(input1_ptr + x);
                const T src2 = *(input2_ptr + x);
                const float tmp_f = tmp_in1 * tmp_in2;
                *(output_ptr + x) = tmp_qua;
        input1, input2, dst);
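// Checks whether the quantized multiplication can take the integer fixed-point path instead
// of the float round trip: the combined multiplier ((iq0.scale * iq1.scale) / oq.scale) * scale
// and the worst-case accumulated result must stay within +/-8191 so the 18-fractional-bit
// representation used below cannot overflow int32.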
bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
                                     const ITensorInfo *src1,
                                     const ITensorInfo *dst,
                                     float scale)
    const auto iq0 = src0->quantization_info().uniform();
    const auto iq1 = src1->quantization_info().uniform();
    const auto oq  = dst->quantization_info().uniform();

    const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;

    if (multiplier < -8191.f || multiplier > 8191.f)

    const auto offset_out = float(oq.offset);

    const auto max_result = multiplier * (256) * (256) + offset_out;

    if (max_result > 8191.f)
template <typename ScalarType>
void mul_q8_neon_fixedpoint(
    const ITensor *src0, const ITensor *src1, ITensor *dst, const Window &window, float scale)
    const auto in0_info = src0->info();
    const auto in1_info = src1->info();

    const auto &in0_shape = in0_info->tensor_shape();
    const auto &in1_shape = in1_info->tensor_shape();

    Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
    Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);

    constexpr int window_step_x = 16;
    const auto window_start_x = window.x().start();
    const auto window_end_x = window.x().end();
    const auto is_broadcast_across_x = in0_shape.x() != in1_shape.x();

    const auto iq0_info = in0_info->quantization_info().uniform();
    const auto iq1_info = in1_info->quantization_info().uniform();
    const auto oq_info = dst->info()->quantization_info().uniform();

    const auto in0_offset = iq0_info.offset;
    const auto in1_offset = iq1_info.offset;
    const auto out_offset = oq_info.offset;
    const auto multiplier = ((iq0_info.scale * iq1_info.scale) / oq_info.scale) * scale;

    constexpr int32_t two_pwr18i = 262144;
    constexpr float two_pwr18f = 262144.f;

    const auto in0_offset_16p0  = static_cast<int16_t>(in0_offset);
    const auto in1_offset_16p0  = static_cast<int16_t>(in1_offset);
    const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
    const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);
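// Variables suffixed _14p18 hold fixed-point values with 18 fractional bits: the float is
// scaled by 2^18 (262144) and truncated to int32, e.g. a multiplier of 0.25f becomes 65536.
// The +/-8191 bound enforced in mul_q8_neon_fixedpoint_possible keeps such values inside
// the int32 range (8191 * 2^18 is just below 2^31).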
    if (is_broadcast_across_x)
        const auto is_broadcast_input_1 = in1_win.x().step() == 0;
        auto a_win = is_broadcast_input_1 ? in0_win : in1_win;
        auto b_win = is_broadcast_input_1 ? in1_win : in0_win;
        const auto a_tensor = is_broadcast_input_1 ? src0 : src1;
        const auto b_tensor = is_broadcast_input_1 ? src1 : src0;

        const auto a_offset_16p0 = is_broadcast_input_1 ? in0_offset_16p0 : in1_offset_16p0;
        const auto b_offset_16p0 = is_broadcast_input_1 ? in1_offset : in0_offset;

        const auto a_offset = is_broadcast_input_1 ? in0_offset : in1_offset;
        const auto b_offset = is_broadcast_input_1 ? in1_offset : in0_offset;

        const auto a_voffset_16p0 = wrapper::vdup_n(a_offset_16p0, wrapper::traits::vector_64_tag());

        Iterator a_input_it(a_tensor, a_win);
        Iterator b_input_it(b_tensor, b_win);
        Iterator out_it(dst, win);

        [&](const Coordinates &)
            const auto a_ptr = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
            const auto b_ptr = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            const auto b_val = *b_ptr;
            const auto b_offseted_32p0 = static_cast<int32_t>(b_val - b_offset_16p0);
            const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());

            const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
            const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());

            int x = window_start_x;

            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

                const auto vout_8p0 =
                    wrapper::vcombine(wrapper::vqrshrn<2>(vout_15p1_0), wrapper::vqrshrn<2>(vout_15p1_1));

            for (; x < window_end_x; ++x)
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
                    (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) +
                multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
        a_input_it, b_input_it, out_it);
        const auto voffset0_16p0 = wrapper::vdup_n(in0_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffset1_16p0 = wrapper::vdup_n(in1_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
        const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());

        Iterator in0_it(src0, in0_win);
        Iterator in1_it(src1, in1_win);
        Iterator out_it(dst, win);

        [&](const Coordinates &)
            const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
            const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            int x = window_start_x;

            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_8p0 =
                    wrapper::vcombine(wrapper::vqrshrn<2>(vout_14p2_0), wrapper::vqrshrn<2>(vout_14p2_1));

            for (; x < window_end_x; ++x)
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(
                    wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) *
                                         (int32_t(in1_ptr[x]) - in1_offset_16p0)) +
                multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) +
        in0_it, in1_it, out_it);
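// QSYMM16 inputs are symmetrically quantized (no offset is used in the scalar tail below),
// so only the per-tensor scales matter. As in the 8-bit path, the values are dequantized to
// float, multiplied and requantized with the element-wise scale folded into tmp_qua_info.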
void mul_saturate_QSYMM16_QSYMM16_QSYMM16(
    const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
    const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int window_step_x = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x = static_cast<int>(window.x().end());

    const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset};

    [&](const Coordinates &)
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());

        int x = window_start_x;
        for (; x <= (window_end_x - window_step_x); x += window_step_x)
            vld1q_s16(input1_ptr + x),
            vld1q_s16(input1_ptr + x + 8),
            vld1q_s16(input2_ptr + x),
            vld1q_s16(input2_ptr + x + 8),

            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 = {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);

        for (; x < window_end_x; ++x)
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f = tmp_in1 * tmp_in2;

            int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
            qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
    input1, input2, dst);
void mul_QSYMM16_QSYMM16_S32(
    const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int window_step_x = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x = static_cast<int>(window.x().end());

    [&](const Coordinates &)
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

        int x = window_start_x;
        for (; x <= (window_end_x - window_step_x); x += window_step_x)
            vld1q_s16(input1_ptr + x),
            vld1q_s16(input1_ptr + x + 8),
            vld1q_s16(input2_ptr + x),
            vld1q_s16(input2_ptr + x + 8),

            const int32x4x4_t in1_s32 = {{
                vmovl_s16(vget_low_s16(input1_q.val[0])),
                vmovl_s16(vget_high_s16(input1_q.val[0])),
                vmovl_s16(vget_low_s16(input1_q.val[1])),
                vmovl_s16(vget_high_s16(input1_q.val[1])),
            const int32x4x4_t in2_s32 = {{
                vmovl_s16(vget_low_s16(input2_q.val[0])),
                vmovl_s16(vget_high_s16(input2_q.val[0])),
                vmovl_s16(vget_low_s16(input2_q.val[1])),
                vmovl_s16(vget_high_s16(input2_q.val[1])),

            const int32x4x4_t result = {{
                vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                vmulq_s32(in1_s32.val[3], in2_s32.val[3]),

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);

        for (; x < window_end_x; ++x)
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
    input1, input2, dst);
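// The integer kernels below are parameterised on two compile-time flags: is_scale255 selects
// the 1/255 rounding path (scale255_U16_U16 / scale255_S32_S32), otherwise the widened
// product is shifted right by n for scale == 1/2^n; is_sat picks saturating (vqshl/vqmovn)
// versus wrapping (vshl/vmovn) shifts and narrowing.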
template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int window_step_x = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x = static_cast<int>(window.x().end());

    [&](const Coordinates &)
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

        int x = window_start_x;
        for (; x <= (window_end_x - window_step_x); x += window_step_x)
            uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low = vmulq_u16(tmp1_low, tmp2_low);

            tmp1_high = scale255_U16_U16(tmp1_high);
            tmp1_low = scale255_U16_U16(tmp1_low);

            const int16x8_t vn = vdupq_n_s16(-n);

            tmp1_high = vqshlq_u16(tmp1_high, vn);
            tmp1_low = vqshlq_u16(tmp1_low, vn);

            tmp1_high = vshlq_u16(tmp1_high, vn);
            tmp1_low = vshlq_u16(tmp1_low, vn);

            vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));

        for (; x < window_end_x; ++x)
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            float tmp_f = static_cast<float>(tmp) * scale255_constant;
            tmp = static_cast<uint16_t>(tmp_f + 0.5f);

            if (is_sat && tmp > 255)

            *(output_ptr + x) = static_cast<uint8_t>(tmp);
    input1, input2, dst);
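// Signed variant: a plain arithmetic shift right would round towards negative infinity, so the
// convert_high/convert_low terms add (2^n - 1) to negative products (sign bit replicated via
// vshrq_n_u32(..., 31)) before shifting, turning the shift into a round-towards-zero division
// by 2^n that matches the scalar reference code.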
template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
    int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
    int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1));
    const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low = vmulq_s32(tmp1_low, tmp2_low);

    tmp1_high = scale255_S32_S32(tmp1_high);
    tmp1_low = scale255_S32_S32(tmp1_low);

    const int32x4_t vn = vdupq_n_s32(-n);

    const int32x4_t vnl = vdupq_n_s32(n);

    const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
    const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
    const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
    const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
    const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
    const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
    const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
    const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);

    tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
    tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);

    tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
    tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);

    return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));

    return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
    const int16x8x2_t result = {{
        mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
        mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)}};
template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int window_step_x = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x = static_cast<int>(window.x().end());

    [&](const Coordinates &)
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        int x = window_start_x;
        for (; x <= (window_end_x - window_step_x); x += window_step_x)
            const int16x8x2_t ta1 = {{
                vld1q_s16(input1_ptr + x),
                vld1q_s16(input1_ptr + x + 8),
            const int16x8x2_t ta2 = {{
                vld1q_s16(input2_ptr + x),
                vld1q_s16(input2_ptr + x + 8),

            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);

        for (; x < window_end_x; ++x)
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            float tmp_f = static_cast<float>(tmp) * scale255_constant;

            tmp = static_cast<int32_t>(tmp_f + 0.5f);

            uint32_t mask = (1u << n) - 1;
            tmp = (tmp + static_cast<int32_t>(mask)) >> n;

            tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);

            *(output_ptr + x) = static_cast<int16_t>(tmp);
    input1, input2, dst);
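// The S32 kernel widens each lane product to 64 bit with vmull_s32, since an int32 x int32
// product can overflow 32 bits, applies the same sign correction as the S16 path on the
// 64-bit lanes, and only then narrows back to int32 (saturating or wrapping depending on is_sat).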
template <bool is_sat>
inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
    const int32x2_t input1_1 = vget_low_s32(src1);
    const int32x2_t input2_1 = vget_low_s32(src2);
    const int32x2_t input1_2 = vget_high_s32(src1);
    const int32x2_t input2_2 = vget_high_s32(src2);

    int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
    int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);

    const int64x2_t vn = vdupq_n_s64(-n);

    const int64x2_t vnl = vdupq_n_s64(n);

    const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1);
    const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63);
    const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1);
    const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);

    const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2);
    const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
    const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
    const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);

    tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
    tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
    return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));

    tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
    tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
    return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
template <bool is_sat>
inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
    const int32x4x2_t result = {{
        mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
        mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)}};
template <bool is_sat>
void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    Window win = window;

    const int window_step_x = 8;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    if (is_broadcast_across_x)
        const bool is_broadcast_input_2 = input2_win.x().step() == 0;
        Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
        Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        [&](const Coordinates &)
            const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
            const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

            const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
            const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);

            int x = window_start_x;
            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                const int32x4x2_t broadcast_v = {{
                    broadcast_value_vec,
                    broadcast_value_vec,
                const int32x4x2_t non_broadcast_v = {{
                    vld1q_s32(non_broadcast_input_ptr + x),
                    vld1q_s32(non_broadcast_input_ptr + x + 4),

                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);

            for (; x < window_end_x; ++x)
                int64_t tmp =
                    static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));

                uint64_t mask = ((uint64_t)1u << n) - 1;
                tmp = (tmp + static_cast<int64_t>(mask)) >> n;

                tmp = utility::clamp<int64_t, int32_t>(tmp);

                *(output_ptr + x) = static_cast<int32_t>(tmp);
        broadcast_input, non_broadcast_input, dst);
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        [&](const Coordinates &)
            const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

            int x = window_start_x;
            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                const int32x4x2_t ta1 = {{
                    vld1q_s32(input1_ptr + x),
                    vld1q_s32(input1_ptr + x + 4),
                const int32x4x2_t ta2 = {{
                    vld1q_s32(input2_ptr + x),
                    vld1q_s32(input2_ptr + x + 4),

                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);

            for (; x < window_end_x; ++x)
                int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));

                uint64_t mask = ((uint64_t)1u << n) - 1;
                tmp = (tmp + static_cast<int64_t>(mask)) >> n;

                tmp = utility::clamp<int64_t, int32_t>(tmp);

                *(output_ptr + x) = static_cast<int32_t>(tmp);
        input1, input2, dst);
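// Complex multiplication of tensors stored as interleaved (real, imaginary) float pairs:
// for a = a0 + i*a1 and b = b0 + i*b1 the result is (a0*b0 - a1*b1) + i*(a0*b1 + a1*b0),
// which is what the scalar tails compute; the vector path uses the {-1, 1, -1, 1} mask to
// apply the sign flip on the real component.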
void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    Window win = window;

    constexpr int window_step_x = 8 / sizeof(float);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;

    if (is_broadcast_across_x)
        const bool is_broadcast_input_2 = input2_win.x().step() == 0;
        Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
        Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        [&](const Coordinates &)
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());

            const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());

            int x = window_start_x;
            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                float32x4_t b = vdupq_n_f32(broadcast_value);

                const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f};

            for (; x < window_end_x; ++x)
                const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
                const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
                auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
                auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
                *(output_ptr + 2 * x) = res1;
                *(output_ptr + 2 * x + 1) = res2;
        broadcast_input, non_broadcast_input, dst);
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        [&](const Coordinates &)
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());

            int x = window_start_x;
            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                const float32x4_t mask = {-1.0f, 1.0f, -1.0f, 1.0f};

            for (; x < window_end_x; ++x)
                const auto a0 = *(input1_ptr + 2 * x);
                const auto a1 = *(input1_ptr + 2 * x + 1);
                const auto b0 = *(input2_ptr + 2 * x);
                const auto b1 = *(input2_ptr + 2 * x + 1);
                auto res1 = a0 * b0 - a1 * b1;
                auto res2 = a0 * b1 + a1 * b0;
                *(output_ptr + 2 * x) = res1;
                *(output_ptr + 2 * x + 1) = res2;
        input1, input2, dst);
template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
    Window win = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int window_step_x = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x = static_cast<int>(window.x().end());

    [&](const Coordinates &)
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        int x = window_start_x;
        for (; x <= (window_end_x - window_step_x); x += window_step_x)
            uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            tmp_low = scale255_U16_U16(tmp_low);
            tmp_high = scale255_U16_U16(tmp_high);

            const int16x8_t vn = vdupq_n_s16(-n);

            tmp_low = vqshlq_u16(tmp_low, vn);
            tmp_high = vqshlq_u16(tmp_high, vn);

            tmp_low = vshlq_u16(tmp_low, vn);
            tmp_high = vshlq_u16(tmp_high, vn);

            static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);

            tmp_low = vminq_u16(tmp_low, max);
            tmp_high = vminq_u16(tmp_high, max);

            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));

        for (; x < window_end_x; ++x)
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            float tmp_f = static_cast<float>(tmp) * scale255_constant;
            tmp = static_cast<int32_t>(tmp_f + 0.5f);

            tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;

            *(output_ptr + x) = static_cast<int16_t>(tmp);
    input1, input2, dst);
template <bool is_scale255, bool is_sat>
void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
    Window win = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int window_step_x = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x = static_cast<int>(window.x().end());

    [&](const Coordinates &)
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        int x = window_start_x;
        for (; x <= (window_end_x - window_step_x); x += window_step_x)
            const int16x8x2_t ta1 = {{
                vld1q_s16(input1_ptr + x),
                vld1q_s16(input1_ptr + x + 8),
            const uint8x8x2_t ta2u = {{
                vld1_u8(input2_ptr + x),
                vld1_u8(input2_ptr + x + 8),
            const int16x8x2_t ta2 = {
                {vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))}};

            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);

        for (; x < window_end_x; ++x)
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            float tmp_f = static_cast<float>(tmp) * scale255_constant;
            tmp = static_cast<int32_t>(tmp_f + 0.5f);

            uint32_t mask = (1u << n) - 1;
            tmp = (tmp + static_cast<int32_t>(mask)) >> n;

            tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);

            *(output_ptr + x) = static_cast<int16_t>(tmp);
    input1, input2, dst);
template <bool is_scale255, bool is_sat>
void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
    mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
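// configure() stores one of three function pointers (_func_quantized, _func_int, _func_float)
// together with either _scale or _scale_exponent, and run_op() dispatches through whichever
// pointer is non-null. Selection depends on the data-type combination, on whether scale is
// exactly 1/255 or 1/2^n, and on whether the convert policy saturates (is_sat).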
    _scale_exponent = 0;
    _func_quantized = nullptr;
    _func_int       = nullptr;
    _func_float     = nullptr;

    bool is_scale_255 = false;
    if (std::abs(scale - scale255_constant) < 0.00001f)
        is_scale_255 = true;

    std::frexp(scale, &exponent);
    _scale_exponent = std::abs(exponent - 1);

    if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
        _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
        _func_quantized = &mul_saturate_quantized_8<uint8_t>;

    if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
        _func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
        _func_quantized = &mul_saturate_quantized_8<int8_t>;

    _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
    _func_int       = &mul_QSYMM16_QSYMM16_S32;

    _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
    _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
    _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
    _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
    _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
    _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
    _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
    _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
    _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
    _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
    _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;

    ICpuKernel::configure(win);
#if defined(ENABLE_FP32_KERNELS)
    mws = default_mws_N1_fp32_neon;
    mws = default_mws_V1_fp32_neon;
    return default_mws_other_platforms_1d_tensor;
    return std::max(static_cast<size_t>(1), mws);
    return default_mws_other_platforms_1d_tensor;
    if (_func_quantized != nullptr)
        (*_func_quantized)(src1, src2, dst, window, _scale);
    else if (_func_int != nullptr)
        (*_func_int)(src1, src2, dst, window, _scale_exponent);
        (*_func_float)(src1, src2, dst, window, _scale);

    return "CpuMulKernel";
    if (dst->total_size() > 0)
        "Wrong shape for dst");

    ICpuKernel::configure(win);

    c_mul_F32_F32_F32_n(src1, src2, dst, window);

    return "CpuComplexMulKernel";