// NOTE(review): this fragment appears to be a whitespace-mangled extract of an
// SVE2 elementwise-add kernel for signed 8-bit quantized tensors. The numeric
// prefixes ("52", "53", ...) look like original source line numbers fused into
// the text, and several statements are visibly truncated: the "const auto rf_N ="
// left-hand sides, the surrounding do/while loop heads, and the
// execute_window_loop lambda wrappers are on lines missing from this view.
// Code is left byte-identical; only comments are added.
//
// Horizontal (x-axis) bounds of the execution window, in elements.
52 const auto window_start_x =
static_cast<int>(window.
x().
start());
53 const auto window_end_x =
static_cast<int>(window.
x().
end());
// Output requantization parameters splatted across all vector lanes:
// 1/scale, and the zero-point offset (note: dup'd as f32 here so it can serve
// directly as the accumulator of the f32 multiply-add below).
60 const auto invvscaleo = svdup_n_f32(1.f / oq_info.
scale);
61 const auto voffseto = svdup_n_f32(oq_info.
offset);
// Special-cased path when one input is broadcast along x.
63 if (is_broadcast_across_x)
// The broadcast input is the one whose window does not advance along x.
65 const bool is_broadcast_input_2 = input2_win.
x().
step() == 0;
66 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
67 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
68 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
69 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
70 const auto all_true_pg = svptrue_b8();
// Select per-input quantization parameters so that (vscale1, voffset1) always
// belong to the NON-broadcast tensor and (vscale2, voffset2) to the broadcast
// one, whichever operand is actually being broadcast.
72 const auto vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.
scale) : svdup_n_f32(iq2_info.
scale);
73 const auto vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.
scale) : svdup_n_f32(iq1_info.
scale);
74 const auto voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.
offset) : svdup_n_s32(iq2_info.
offset);
75 const auto voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.
offset) : svdup_n_s32(iq1_info.
offset);
80 Iterator broadcast_input(broadcast_tensor, broadcast_win);
81 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
// Per-row raw pointers — presumably set up inside an execute_window_loop
// lambda whose header is not visible in this fragment.
88 const auto non_broadcast_input_ptr =
reinterpret_cast<const int8_t *
>(non_broadcast_input.
ptr());
89 const auto output_ptr =
reinterpret_cast<int8_t *
>(output.
ptr());
// The broadcast operand contributes a single scalar per row; splat it.
91 const int8_t broadcast_value = *
reinterpret_cast<const int8_t *
>(broadcast_input.
ptr());
92 const auto broadcast_value_vec = svdup_n_s8(broadcast_value);
// Predicated tail loop over x: pg covers the remaining [x, window_end_x) bytes.
94 int x = window_start_x;
95 svbool_t pg = svwhilelt_b8(x, window_end_x);
// Dequantize the broadcast s8 vector in four f32 quarters: movlb/movlt widen
// s8 -> s16 -> s32 (bottom/top halves), then (q - offset) * scale.
// NOTE(review): the terminating "vscale2);" of each of these four statements
// is on lines missing from this fragment.
96 const auto bf_0 = svmul_f32_z(
97 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)),
99 const auto bf_1 = svmul_f32_z(
100 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)),
102 const auto bf_2 = svmul_f32_z(
103 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)),
105 const auto bf_3 = svmul_f32_z(
106 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)),
// Load and dequantize the non-broadcast operand the same way.
111 const auto a = svld1_s8(pg, non_broadcast_input_ptr + x);
112 const auto af_0 = svmul_f32_z(
113 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
114 const auto af_1 = svmul_f32_z(
115 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
116 const auto af_2 = svmul_f32_z(
117 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
118 const auto af_3 = svmul_f32_z(
119 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
// Requantize each quarter: offset_out + (af + bf) * (1/scale_out), converted
// back to s32 (svmla_f32_z computes acc + a*b with voffseto as acc).
// NOTE(review): the "const auto rf_N =" left-hand sides are on lines missing
// from this fragment; rf_0..rf_3 are consumed below.
122 svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
124 svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
126 svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
128 svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
// Saturating narrow back down: s32 -> s16 (qxtnb fills bottom, qxtnt fills
// top) and then s16 -> s8, restoring the original element order.
130 const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
131 const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
132 const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
// Predicated store — only the active lanes are written.
134 svst1_s8(pg, output_ptr + x, res);
// Recompute the predicate for the next chunk (the x increment — presumably by
// svcntb() — is on a missing line) and loop while any lane remains active.
137 pg = svwhilelt_b8(x, window_end_x);
138 }
while (svptest_any(all_true_pg, pg));
140 broadcast_input, non_broadcast_input, output);
// Generic path: both inputs advance along x; splat each input's own
// quantization parameters directly.
152 const auto vscale1 = svdup_n_f32(iq1_info.
scale);
153 const auto vscale2 = svdup_n_f32(iq2_info.
scale);
154 const auto voffset1 = svdup_n_s32(iq1_info.
offset);
155 const auto voffset2 = svdup_n_s32(iq2_info.
offset);
// Per-row input/output pointers (wrapper lambda again not visible here).
161 const auto input1_ptr =
reinterpret_cast<const int8_t *
>(input1.
ptr());
162 const auto input2_ptr =
reinterpret_cast<const int8_t *
>(input2.
ptr());
163 const auto output_ptr =
reinterpret_cast<int8_t *
>(output.
ptr());
// Predicated tail loop, same structure as the broadcast path above.
165 int x = window_start_x;
166 svbool_t pg = svwhilelt_b8(x, window_end_x);
169 const auto a = svld1_s8(pg, input1_ptr + x);
170 const auto b = svld1_s8(pg, input2_ptr + x);
// Dequantize both operands in four f32 quarters each: (q - offset) * scale.
172 const auto af_0 = svmul_f32_z(
173 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
174 const auto af_1 = svmul_f32_z(
175 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
176 const auto af_2 = svmul_f32_z(
177 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
178 const auto af_3 = svmul_f32_z(
179 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
181 const auto bf_0 = svmul_f32_z(
182 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(
b)), voffset2)), vscale2);
183 const auto bf_1 = svmul_f32_z(
184 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(
b)), voffset2)), vscale2);
185 const auto bf_2 = svmul_f32_z(
186 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(
b)), voffset2)), vscale2);
187 const auto bf_3 = svmul_f32_z(
188 pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(
b)), voffset2)), vscale2);
// Requantize: offset_out + (af + bf) * (1/scale_out) -> s32.
// NOTE(review): the "const auto rf_N =" left-hand sides are again on lines
// missing from this fragment.
191 svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
193 svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
195 svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
197 svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
// Saturating narrow s32 -> s16 -> s8 and store the active lanes.
199 const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
200 const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
201 const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
203 svst1_s8(pg, output_ptr + x, res);
206 pg = svwhilelt_b8(x, window_end_x);
207 }
// NOTE(review): this loop condition re-evaluates svptrue_b8() where the
// broadcast path cached it as all_true_pg — same value, inconsistent style.
while (svptest_any(svptrue_b8(), pg));
209 input1, input2, output);