52 const auto window_start_x =
static_cast<int>(window.
x().
start());
53 const auto window_end_x =
static_cast<int>(window.
x().
end());
60 const auto vscale1 = svdup_n_f32(iq1_info.
scale);
61 const auto vscale2 = svdup_n_f32(iq2_info.
scale);
62 const auto invvscaleo = svdup_n_f32(1.f / oq_info.
scale);
63 const auto all_true_pg = svptrue_b16();
65 if (is_broadcast_across_x)
67 const bool is_broadcast_input_2 = input2_win.
x().
step() == 0;
68 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
69 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
70 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
71 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
76 Iterator broadcast_input(broadcast_tensor, broadcast_win);
77 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
84 const auto non_broadcast_input_ptr =
reinterpret_cast<const int16_t *
>(non_broadcast_input.
ptr());
85 const auto output_ptr =
reinterpret_cast<int16_t *
>(output.
ptr());
87 const int16_t broadcast_value = *
reinterpret_cast<const int16_t *
>(broadcast_input.
ptr());
88 const auto broadcast_value_vec = svdup_n_s16(broadcast_value);
90 int x = window_start_x;
91 svbool_t pg = svwhilelt_b16(x, window_end_x);
93 const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2);
94 const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2);
98 const auto a = svld1_s16(pg, non_broadcast_input_ptr + x);
99 const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
100 const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
102 const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
103 const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
105 const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
107 svst1_s16(pg, output_ptr + x, res);
110 pg = svwhilelt_b16(x, window_end_x);
111 }
while (svptest_any(all_true_pg, pg));
113 broadcast_input, non_broadcast_input, output);
129 const auto input1_ptr =
reinterpret_cast<const int16_t *
>(input1.
ptr());
130 const auto input2_ptr =
reinterpret_cast<const int16_t *
>(input2.
ptr());
131 const auto output_ptr =
reinterpret_cast<int16_t *
>(output.
ptr());
133 int x = window_start_x;
134 svbool_t pg = svwhilelt_b16(x, window_end_x);
137 auto a = svld1_s16(pg, input1_ptr + x);
138 auto b = svld1_s16(pg, input2_ptr + x);
140 const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
141 const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
143 const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(
b)), vscale2);
144 const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(
b)), vscale2);
146 const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
147 const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
149 const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
150 svst1_s16(pg, output_ptr + x, res);
153 pg = svwhilelt_b16(x, window_end_x);
154 }
while (svptest_any(all_true_pg, pg));
156 input1, input2, output);