// NOTE(review): only selected lines of this function are visible in this listing (the
// signature, braces, the `do {` openers, the heads of several calls and the definitions of
// `res`, `output`, `input1`, `input2` and `is_broadcast_across_x` are not shown). The
// comments below describe what the visible lines establish; anything else is hedged.
//
// Function template parameterised on a single element type, used for loads, stores and
// predicate generation alike.
37 template <
typename ScalarType>
// Predicate built by svptrue for ScalarType lanes; it is the governing predicate passed to
// the svptest_any loop-exit checks below.
43 const auto all_true_pg = svptrue<ScalarType>();
// X-dimension bounds of `window`, converted to int so they can be used as element offsets
// and as svwhilelt limits.
53 const auto window_start_x =
static_cast<int>(window.
x().
start());
54 const auto window_end_x =
static_cast<int>(window.
x().
end());
// Path taken when is_broadcast_across_x is set (that flag is computed outside the visible lines).
57 if (is_broadcast_across_x)
// Input 2 is treated as the broadcast operand when its window has an X step of 0;
// otherwise input 1 is.
59 const bool is_broadcast_input_2 = input2_win.
x().
step() == 0;
// Pair each role (broadcast / non-broadcast) with the matching window and tensor.
60 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
61 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
62 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
63 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
// Iterators over the two inputs, each using its own window.
68 Iterator broadcast_input(broadcast_tensor, broadcast_win);
69 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
// Typed views of the iterators' current positions; offsets below are in ScalarType elements.
76 auto output_ptr =
reinterpret_cast<ScalarType *
>(output.
ptr());
77 const auto non_broadcast_input_ptr =
reinterpret_cast<const ScalarType *
>(non_broadcast_input.
ptr());
// A single scalar is read from the broadcast input and replicated into a vector with svdup_n.
78 const ScalarType broadcast_value = *
reinterpret_cast<const ScalarType *
>(broadcast_input.
ptr());
79 const auto broadcast_vector = svdup_n(broadcast_value);
// Predicated loop over X: pg comes from svwhilelt(x, window_end_x), so (following the ACLE
// whilelt semantics) only lanes whose index is below window_end_x are expected to be active.
81 int x = window_start_x;
83 svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
// Predicated load of the non-broadcast operand at element offset x.
86 const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
// The two argument tails below belong to calls whose heads are not visible. They show
// broadcast_vector as the last vector argument when input 2 is the broadcast operand and as
// the first one otherwise - presumably to keep the original (input1, input2) operand order.
89 if (is_broadcast_input_2)
92 broadcast_vector, op);
97 pg, broadcast_vector, non_broadcast_vector, op);
// Predicated store of `res` (assigned on lines not shown here) at the same element offset.
99 svst1(pg, output_ptr + x, res);
// Advance by svcnt<ScalarType>() elements and rebuild the predicate; the loop repeats while
// svptest_any reports at least one active lane in pg.
101 x += svcnt<ScalarType>();
102 pg = svwhilelt<ScalarType>(x, window_end_x);
103 }
while (svptest_any(all_true_pg, pg));
// Trailing arguments of a call whose head is not visible: the two input iterators and `output`.
105 broadcast_input, non_broadcast_input, output);
// Second set of typed pointers, this time reading both operands element-wise through
// `input1` / `input2` (declared outside the visible lines) - presumably the non-broadcast path.
121 auto output_ptr =
reinterpret_cast<ScalarType *
>(output.
ptr());
122 const auto input1_ptr =
reinterpret_cast<const ScalarType *
>(input1.
ptr());
123 const auto input2_ptr =
reinterpret_cast<const ScalarType *
>(input2.
ptr());
// Same predicated X loop as above, with both operands loaded under pg at offset x.
125 int x = window_start_x;
127 svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
130 const auto in1 = svld1(pg, input1_ptr + x);
131 const auto in2 = svld1(pg, input2_ptr + x);
// `res` is computed from in1 / in2 on a line not shown here, then stored under pg.
133 svst1(pg, output_ptr + x, res);
// Step to the next vector's worth of elements; exit once pg has no active lane.
135 x += svcnt<ScalarType>();
136 pg = svwhilelt<ScalarType>(x, window_end_x);
137 }
while (svptest_any(all_true_pg, pg));
// Trailing arguments of a call whose head is not visible: both input iterators and `output`.
139 input1, input2, output);
// NOTE(review): only selected lines of this function are visible in this listing (the
// signature, braces, the `do {` openers, the heads of several calls and the definitions of
// `res` in the second loop, `output`, `input1`, `input2`, `OutputVectorType` and
// `is_broadcast_across_x` are not shown). Comments describe what the visible lines
// establish; anything else is hedged.
//
// Function template with separate element types for the inputs and for the output.
151 template <
typename InputScalarType,
typename OutputScalarType>
// Compile-time guard: the output element type must not be wider than the input element type.
155 static_assert(
sizeof(InputScalarType) >=
sizeof(OutputScalarType),
156 "input data type's width should be equal to or greater than output data type's width");
// Predicate built by svptrue for InputScalarType lanes; it is the governing predicate
// passed to the svptest_any loop-exit checks below.
159 const auto all_true_pg = svptrue<InputScalarType>();
// X-dimension bounds of `window`, converted to int so they can be used as element offsets
// and as svwhilelt limits.
169 const auto window_start_x =
static_cast<int>(window.
x().
start());
170 const auto window_end_x =
static_cast<int>(window.
x().
end());
// Path taken when is_broadcast_across_x is set (that flag is computed outside the visible lines).
173 if (is_broadcast_across_x)
// Input 2 is treated as the broadcast operand when its window has an X step of 0;
// otherwise input 1 is.
175 const bool is_broadcast_input_2 = input2_win.
x().
step() == 0;
// Pair each role (broadcast / non-broadcast) with the matching window and tensor.
176 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
177 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
178 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
179 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
// Iterators over the two inputs, each using its own window.
184 Iterator broadcast_input(broadcast_tensor, broadcast_win);
185 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
// Typed views of the iterators' current positions: the output is addressed in
// OutputScalarType elements, the inputs in InputScalarType elements.
192 auto output_ptr =
reinterpret_cast<OutputScalarType *
>(output.
ptr());
193 const auto non_broadcast_input_ptr =
194 reinterpret_cast<const InputScalarType *
>(non_broadcast_input.
ptr());
// A single scalar is read from the broadcast input and replicated into a vector with svdup_n.
195 const InputScalarType broadcast_value =
196 *
reinterpret_cast<const InputScalarType *
>(broadcast_input.
ptr());
197 const auto broadcast_vector = svdup_n(broadcast_value);
// Predicated loop over X: pg comes from svwhilelt(x, window_end_x), so (following the ACLE
// whilelt semantics) only lanes whose index is below window_end_x are expected to be active.
199 int x = window_start_x;
201 svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
// Predicated load of the non-broadcast operand at element offset x.
204 const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
// Store predicate derived from the input-lane predicate by narrow_to_byte_predicate
// (defined outside this view; presumably it repacks pg to byte-sized lanes - TODO confirm).
205 const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
// Value-initialised result vector; it is assigned by calls whose heads are not visible.
206 OutputVectorType res{};
// The two argument tails below keep the original operand order: the broadcast vector is
// passed second when it is input 2 and first otherwise.
207 if (is_broadcast_input_2)
211 pg, non_broadcast_vector, broadcast_vector, op);
217 pg, broadcast_vector, non_broadcast_vector, op);
// Store under the narrowed predicate; the output uses the same element index x as the inputs.
219 svst1(output_pg, output_ptr + x, res);
// Advance by svcnt<InputScalarType>() elements and rebuild the predicate; the loop repeats
// while svptest_any reports at least one active lane in pg.
221 x += svcnt<InputScalarType>();
222 pg = svwhilelt<InputScalarType>(x, window_end_x);
223 }
while (svptest_any(all_true_pg, pg));
// Trailing arguments of a call whose head is not visible: the two input iterators and `output`.
225 broadcast_input, non_broadcast_input, output);
// Second set of typed pointers, this time reading both operands element-wise through
// `input1` / `input2` (declared outside the visible lines) - presumably the non-broadcast path.
241 auto output_ptr =
reinterpret_cast<OutputScalarType *
>(output.
ptr());
242 const auto input1_ptr =
reinterpret_cast<const InputScalarType *
>(input1.
ptr());
243 const auto input2_ptr =
reinterpret_cast<const InputScalarType *
>(input2.
ptr());
// Same predicated X loop as above, with both operands loaded under pg at offset x.
245 int x = window_start_x;
247 svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
250 const auto in1 = svld1(pg, input1_ptr + x);
251 const auto in2 = svld1(pg, input2_ptr + x);
// `res` is computed from in1 / in2 on lines not shown here; the store again uses a
// predicate narrowed from pg via narrow_to_byte_predicate.
255 const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
256 svst1(output_pg, output_ptr + x, res);
// Step to the next vector's worth of input elements; exit once pg has no active lane.
258 x += svcnt<InputScalarType>();
259 pg = svwhilelt<InputScalarType>(x, window_end_x);
260 }
while (svptest_any(all_true_pg, pg));
// Trailing arguments of a call whose head is not visible: both input iterators and `output`.
262 input1, input2, output);
// Power computed through a float round-trip: `a` and `b` are converted to f32 under pg,
// combined with svpow_z, and the result is converted back to s32. All three steps use the
// `_z` predicated forms. NOTE(review): the enclosing function header and the declaration of
// svpow_z are outside the visible lines; the f32 -> s32 conversion presumably drops the
// fractional part - confirm against the ACLE svcvt documentation.
280 return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg,
b)));
// Division computed through a float round-trip: `a` and `b` are converted to f32 under pg,
// divided with svdiv_z, and the quotient is converted back to s32. All three steps use the
// `_z` predicated forms. NOTE(review): the enclosing function header is outside the visible
// lines; the f32 -> s32 conversion presumably drops the fractional part - confirm against
// the ACLE svcvt documentation.
286 return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg,
b)));