// Start and end of the execution window along x, narrowed to int.
49 const auto window_start_x =
static_cast<int>(window.
x().
start());
50 const auto window_end_x =
static_cast<int>(window.
x().
end());
// Output requantization constants, each replicated across every 32-bit float lane.
// The reciprocal of the output scale is computed once here so that later code can
// multiply instead of divide.
57 const auto invvscaleo = svdup_n_f32(1.f / oq_info.
scale);
// The output offset is duplicated as float (svdup_n_f32), unlike the input offsets
// elsewhere in this function, which are duplicated with svdup_n_s32.
58 const auto voffseto = svdup_n_f32(oq_info.
offset);
// Broadcast path: one operand supplies a single int8 value that is combined with
// every element read from the other operand.
// NOTE(review): the braces, loop headers and the statement that advances x are not
// visible in this excerpt; the comments below describe only the visible statements.
60 if(is_broadcast_across_x)
// Input 2 is treated as the broadcast operand when its window has a zero step along x;
// otherwise input 1 is.
62 const bool is_broadcast_input_2 = input2_win.
x().
step() == 0;
// Select the window and tensor for the broadcast and non-broadcast roles.
63 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
64 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
65 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
66 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
// All-lanes-active byte predicate; used as the governing predicate of the
// svptest_any loop-exit test further down.
67 const auto all_true_pg = svptrue_b8();
// Quantization constants by role rather than by input number: the "1" constants are
// applied to the non-broadcast operand and the "2" constants to the broadcast value.
// When input 2 is the broadcast operand they come from iq1_info and iq2_info
// respectively; otherwise the two are swapped.
69 const auto vscale1 = is_broadcast_input_2 ? svdup_n_f32(iq1_info.
scale) : svdup_n_f32(iq2_info.
scale);
70 const auto vscale2 = is_broadcast_input_2 ? svdup_n_f32(iq2_info.
scale) : svdup_n_f32(iq1_info.
scale);
71 const auto voffset1 = is_broadcast_input_2 ? svdup_n_s32(iq1_info.
offset) : svdup_n_s32(iq2_info.
offset);
72 const auto voffset2 = is_broadcast_input_2 ? svdup_n_s32(iq2_info.
offset) : svdup_n_s32(iq1_info.
offset);
// Iterators over the two operands, each built with the window chosen for its role.
77 Iterator broadcast_input(broadcast_tensor, broadcast_win);
78 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
// Current positions of the non-broadcast input and of the output, viewed as int8.
// NOTE(review): `output` is declared outside the visible lines.
83 const auto non_broadcast_input_ptr =
reinterpret_cast<const int8_t *
>(non_broadcast_input.
ptr());
84 const auto output_ptr =
reinterpret_cast<int8_t *
>(output.
ptr());
// Read the single broadcast value and replicate it into every byte lane.
86 const int8_t broadcast_value = *
reinterpret_cast<const int8_t *
>(broadcast_input.
ptr());
87 const auto broadcast_value_vec = svdup_n_s8(broadcast_value);
// x is the element index along the row; pg enables the byte lanes whose index
// (x + lane) is still below window_end_x, so a partial final vector is handled by
// predication.
89 int x = window_start_x;
90 svbool_t pg = svwhilelt_b8(x, window_end_x);
// Dequantize the broadcast value: (value - voffset2) * vscale2.
// Each svmovlb/svmovlt pairing widens a different interleaved quarter of the int8
// lanes to 32 bits (b = even-indexed "bottom" elements, t = odd-indexed "top"
// elements), giving four float vectors that together cover one int8 vector.
91 const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
92 const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
93 const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
94 const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
// Predicated load of the next int8 elements of the non-broadcast operand at offset x.
98 const auto a = svld1_s8(pg, non_broadcast_input_ptr + x);
// Dequantize them the same way, using the non-broadcast constants:
// (a - voffset1) * vscale1.
99 const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
100 const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
101 const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
102 const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
// Add in float and requantize: voffseto + (af + bf) * invvscaleo, then convert the
// result to signed 32-bit integers.
104 const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
105 const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
106 const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
107 const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
// Saturating narrow 32 -> 16 -> 8 bits. svqxtnb fills the even (bottom) lanes and
// svqxtnt the odd (top) lanes, which mirrors the b/t widening above and so restores
// the original element order.
109 const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
110 const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
111 const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
// Predicated store: only lanes enabled in pg are written to the output.
113 svst1_s8(pg, output_ptr + x, res);
// Rebuild the predicate for the next chunk.
// NOTE(review): the statement that advances x before this point is not visible here.
116 pg = svwhilelt_b8(x, window_end_x);
// Loop condition: true while at least one lane of pg is still active.
// NOTE(review): presumably the tail of a do/while whose `do` is not visible.
118 while(svptest_any(all_true_pg, pg));
// Trailing arguments of a call whose beginning is not visible in this excerpt.
120 broadcast_input, non_broadcast_input, output);
// Non-broadcast path: both operands are read element by element.
// NOTE(review): the enclosing else/braces, loop headers and the statement that
// advances x are not visible in this excerpt.
// Input quantization constants replicated across lanes: "1" constants come from
// iq1_info and are applied to values loaded via input1_ptr; "2" constants come from
// iq2_info and are applied to values loaded via input2_ptr.
132 const auto vscale1 = svdup_n_f32(iq1_info.
scale);
133 const auto vscale2 = svdup_n_f32(iq2_info.
scale);
134 const auto voffset1 = svdup_n_s32(iq1_info.
offset);
135 const auto voffset2 = svdup_n_s32(iq2_info.
offset);
// Current positions of both inputs and of the output, viewed as int8.
// NOTE(review): input1, input2 and output are declared outside the visible lines.
139 const auto input1_ptr =
reinterpret_cast<const int8_t *
>(input1.
ptr());
140 const auto input2_ptr =
reinterpret_cast<const int8_t *
>(input2.
ptr());
141 const auto output_ptr =
reinterpret_cast<int8_t *
>(output.
ptr());
// x is the element index along the row; pg enables the byte lanes whose index
// (x + lane) is still below window_end_x, so a partial final vector is handled by
// predication.
143 int x = window_start_x;
144 svbool_t pg = svwhilelt_b8(x, window_end_x);
// Predicated loads of the next int8 elements from each input at offset x.
147 const auto a = svld1_s8(pg, input1_ptr + x);
148 const auto b = svld1_s8(pg, input2_ptr + x);
// Dequantize input 1: (a - voffset1) * vscale1.
// Each svmovlb/svmovlt pairing widens a different interleaved quarter of the int8
// lanes to 32 bits (b = even-indexed "bottom" elements, t = odd-indexed "top"
// elements), giving four float vectors that together cover one int8 vector.
150 const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
151 const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
152 const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
153 const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
// Dequantize input 2 the same way: (b - voffset2) * vscale2.
155 const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(
b)), voffset2)), vscale2);
156 const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(
b)), voffset2)), vscale2);
157 const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(
b)), voffset2)), vscale2);
158 const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(
b)), voffset2)), vscale2);
// Add in float and requantize: voffseto + (af + bf) * invvscaleo, then convert the
// result to signed 32-bit integers.
160 const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
161 const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
162 const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
163 const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
// Saturating narrow 32 -> 16 -> 8 bits. svqxtnb fills the even (bottom) lanes and
// svqxtnt the odd (top) lanes, which mirrors the b/t widening above and so restores
// the original element order.
165 const auto pa = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
166 const auto pb = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
167 const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
// Predicated store: only lanes enabled in pg are written to the output.
169 svst1_s8(pg, output_ptr + x, res);
// Rebuild the predicate for the next chunk.
// NOTE(review): the statement that advances x before this point is not visible here.
172 pg = svwhilelt_b8(x, window_end_x);
// Loop condition: true while at least one lane of pg is still active. This path
// builds the all-true governing predicate inline with svptrue_b8().
// NOTE(review): presumably the tail of a do/while whose `do` is not visible.
174 while(svptest_any(svptrue_b8(), pg));
// Trailing arguments of a call whose beginning is not visible in this excerpt.
176 input1, input2, output);