    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    const auto all_true_pg = svptrue_b8();

    const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
    const auto voffseto   = svdup_n_f32(oq_info.offset);
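    // Per element the kernel computes a requantized addition:
    //   out = saturate_u8(((a - off1) * scale1 + (b - off2) * scale2) * (1 / out_scale) + out_offset)
    // invvscaleo and voffseto hold the output-side constants splatted across a vector.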
    if (is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src1 : src0;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;

        // vscale1/voffset1 always refer to the non-broadcast input, vscale2/voffset2 to the broadcast one.
        const svfloat32_t vscale1  = is_broadcast_input_2 ? svdup_n_f32(iq1_info.scale) : svdup_n_f32(iq2_info.scale);
        const svfloat32_t vscale2  = is_broadcast_input_2 ? svdup_n_f32(iq2_info.scale) : svdup_n_f32(iq1_info.scale);
        const svint32_t   voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.offset) : svdup_n_s32(iq2_info.offset);
        const svint32_t   voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.offset) : svdup_n_s32(iq1_info.offset);
        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(dst, win); // dst/win: output tensor and execution window set up earlier in the function
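        // execute_window_loop walks the outer (non-x) dimensions; the x dimension is
        // vectorised by hand inside the lambda using predicated SVE2 loops.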
        execute_window_loop(
            win,
            [&](const Coordinates &)
            {
                const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
                const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());

                const uint8_t   broadcast_value     = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
                const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value);
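                // The broadcast operand contributes a single scalar per row; it is splatted into a
                // vector once and dequantized ahead of the inner loop below, since it never changes along x.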
                int      x  = window_start_x;
                svbool_t pg = svwhilelt_b8(x, window_end_x);

                const auto bf_0 = svmul_f32_z(
                    pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)),
                    vscale2);
                const auto bf_1 = svmul_f32_z(
                    pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)),
                    vscale2);
                const auto bf_2 = svmul_f32_z(
                    pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)),
                    vscale2);
                const auto bf_3 = svmul_f32_z(
                    pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)),
                    vscale2);
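                // Predicated loop over the row: each iteration loads one vector of bytes and widens it
                // into four float32 quarters (movlb/movlt select the even/odd elements at each width).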
                do
                {
                    const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x);

                    const auto af_0 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)),
                        vscale1);
                    const auto af_1 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)),
                        vscale1);
                    const auto af_2 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)),
                        vscale1);
                    const auto af_3 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)),
                        vscale1);
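                    // Requantize: rf = (af + bf) * (1 / out_scale) + out_offset, computed with a
                    // fused multiply-add and converted back to unsigned 32-bit integers.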
                    const auto rf_0 =
                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
                    const auto rf_1 =
                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
                    const auto rf_2 =
                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
                    const auto rf_3 =
                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
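                    // Saturating narrow back to u8: qxtnb/qxtnt interleave the four u32 quarters into
                    // u16 pairs and then into a single byte vector in the original lane order.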
                    const auto pa  = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
                    const auto pb  = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
                    const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);

                    svst1_u8(pg, output_ptr + x, res);
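                    // Advance by the vector length in bytes; svwhilelt yields a partial predicate
                    // for the tail and an all-false one once the row is done.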
                    x += svcntb();
                    pg = svwhilelt_b8(x, window_end_x);
                } while (svptest_any(all_true_pg, pg));
            },
            broadcast_input, non_broadcast_input, output);
    }
    else
    {
        Iterator input1(src0, input1_win);
        Iterator input2(src1, input2_win);
        Iterator output(dst, win);

        const auto vscale1  = svdup_n_f32(iq1_info.scale);
        const auto vscale2  = svdup_n_f32(iq2_info.scale);
        const auto voffset1 = svdup_n_s32(iq1_info.offset);
        const auto voffset2 = svdup_n_s32(iq2_info.offset);
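        // No broadcasting: both inputs are read as full rows with the same predication, each
        // dequantized with its own scale/offset (vscale1/voffset1 for input1, vscale2/voffset2 for input2).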
        execute_window_loop(
            win,
            [&](const Coordinates &)
            {
                const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
                const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
                const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
                int      x  = window_start_x;
                svbool_t pg = svwhilelt_b8(x, window_end_x);
                do
                {
                    const auto a = svld1_u8(pg, input1_ptr + x);
                    const auto b = svld1_u8(pg, input2_ptr + x);
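                    // Same widen-and-dequantize sequence as the broadcast path, applied to both
                    // operands: four s32 quarters per input, offset-corrected and scaled to f32.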
                    const auto af_0 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)),
                        vscale1);
                    const auto af_1 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)),
                        vscale1);
                    const auto af_2 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)),
                        vscale1);
                    const auto af_3 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)),
                        vscale1);
                    const auto bf_0 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)),
                        vscale2);
                    const auto bf_1 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)),
                        vscale2);
                    const auto bf_2 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)),
                        vscale2);
                    const auto bf_3 = svmul_f32_z(
                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)),
                        vscale2);
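                    // Requantize, saturating-narrow and store exactly as in the broadcast path.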
                    const auto rf_0 =
                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
                    const auto rf_1 =
                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
                    const auto rf_2 =
                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
                    const auto rf_3 =
                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
                    const auto pa  = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
                    const auto pb  = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
                    const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);

                    svst1_u8(pg, output_ptr + x, res);
                    x += svcntb();
                    pg = svwhilelt_b8(x, window_end_x);
                } while (svptest_any(all_true_pg, pg));
            },
            input1, input2, output);
    }