49 const int window_step_x = 8;
50 const auto window_start_x =
static_cast<int>(window.
x().
start());
51 const auto window_end_x =
static_cast<int>(window.
x().
end());
58 const float32x4_t vscale1 = vdupq_n_f32(iq1_info.
scale);
59 const float32x4_t vscale2 = vdupq_n_f32(iq2_info.
scale);
60 const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.
scale);
62 if (is_broadcast_across_x)
64 const bool is_broadcast_input_2 = input2_win.
x().
step() == 0;
65 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
66 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
67 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
68 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
75 Iterator broadcast_input(broadcast_tensor, broadcast_win);
76 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
83 const auto non_broadcast_input_ptr =
reinterpret_cast<const int16_t *
>(non_broadcast_input.
ptr());
84 const auto output_ptr =
reinterpret_cast<int16_t *
>(output.
ptr());
86 const int16_t broadcast_value = *
reinterpret_cast<const int16_t *
>(broadcast_input.
ptr());
87 const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
89 const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
90 const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
91 const float bfs =
static_cast<int32_t
>(broadcast_value) * broadcast_qinfo.
scale;
94 int x = window_start_x;
95 for (; x <= (window_end_x - window_step_x); x += window_step_x)
97 const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x);
98 const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
99 const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
104 rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
105 rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
107 rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
108 rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
111 const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
112 vst1q_s16(output_ptr + x, pa);
116 for (; x < window_end_x; ++x)
118 const float afs =
static_cast<int32_t
>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.
scale;
122 broadcast_input, non_broadcast_input, output);
138 const auto input1_ptr =
reinterpret_cast<const int16_t *
>(input1.
ptr());
139 const auto input2_ptr =
reinterpret_cast<const int16_t *
>(input2.
ptr());
140 const auto output_ptr =
reinterpret_cast<int16_t *
>(output.
ptr());
143 int x = window_start_x;
144 for (; x <= (window_end_x - window_step_x); x += window_step_x)
146 const int16x8_t a = vld1q_s16(input1_ptr + x);
147 const int16x8_t
b = vld1q_s16(input2_ptr + x);
149 const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
150 const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
151 const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(
b))), vscale2);
152 const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(
b))), vscale2);
157 rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
158 rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
160 rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
161 rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
164 const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
165 vst1q_s16(output_ptr + x, pa);
169 for (; x < window_end_x; ++x)
171 const float afs =
static_cast<int32_t
>((*(input1_ptr + x))) * iq1_info.
scale;
172 const float bfs =
static_cast<int32_t
>((*(input2_ptr + x))) * iq2_info.
scale;
176 input1, input2, output);