35 #include "interleaves/list.hpp"
37 #if defined(__aarch64__)
38 #if defined(ARM_COMPUTE_ENABLE_SVE)
39 #if defined(ARM_COMPUTE_ENABLE_SME2)
54 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
65 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
75 #endif // defined(__aarch64__)
82 #if defined(__aarch64__)
// Heuristic predicate over the problem shape in `args`.
//
// The final result is `args.channel_multiplier <= threshold`, where `threshold`
// is chosen per (stride_rows, kernel_rows) pair. Elsewhere in this file the
// result is used to score kernels: cycle_estimate() rejects a kernel when the
// channel multiplier is > 1 and this returns false, and
// multiplier_cycle_estimate() rejects the "with_multiplier" kernels when this
// returns true.
//
// NOTE(review): the bodies of the branches below (the early exit and the
// concrete threshold values) are not visible in this view — confirm against
// the full file before relying on specific numbers.
83 bool prefer_premultiply(
const DepthwiseArgs &
args) {
// Guard: only square strides and square kernels are considered; the action
// taken for non-square problems is not visible here (presumably an early
// `return false` — TODO confirm).
84 if ((
args.stride_rows !=
args.stride_cols) || (
args.kernel_rows !=
args.kernel_cols))
// Declared without an initializer; each recognised configuration below is
// expected to assign it before the final comparison.
89 unsigned int threshold;
// Stride 1, 3x3 kernel.
91 if (
args.stride_rows == 1 &&
args.kernel_rows == 3)
// Stride 1, 5x5 kernel.
95 else if (
args.stride_rows == 1 &&
args.kernel_rows == 5)
// Stride 2, 3x3 kernel.
99 else if (
args.stride_rows == 2 &&
args.kernel_rows == 3)
// Stride 2, 5x5 kernel.
103 else if (
args.stride_rows == 2 &&
args.kernel_rows == 5)
// NOTE(review): handling of any other stride/kernel combination is not visible
// in this view; verify `threshold` cannot be read uninitialised.
111 return args.channel_multiplier <= threshold;
// Cycle-estimate callback used by the depthfirst entries of the method table.
//
// Template parameter `Strategy` must expose `return_type` and `vl_type`; both
// are used to query the vector length via arm_gemm::utils::get_vector_length.
// The second parameter (`const Nothing &`) is unnamed and unused.
//
// Returns the maximum `unsigned int` value when the problem has a channel
// multiplier greater than one and prefer_premultiply(args) is false.
// NOTE(review): the remainder of the estimate (the call that consumes the two
// arguments shown at the end of this block) is not visible in this view.
114 template <
class Strategy>
115 unsigned int cycle_estimate(
const DepthwiseArgs &
args,
const Nothing &)
// Reject this kernel outright for channel-multiplied problems that
// prefer_premultiply() does not accept.
117 if (
args.channel_multiplier > 1 && !prefer_premultiply(
args))
119 return std::numeric_limits<unsigned int>::max();
// Total channel count after applying the multiplier, followed by the vector
// length for the strategy's return type. Both are arguments to a call whose
// head is not visible here.
126 (
long unsigned)
args.input_channels *
args.channel_multiplier,
127 arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
// Cycle-estimate callback used by planar entries of the method table.
//
// Template parameter `Strategy` must expose `return_type` and `vl_type`.
// The second parameter (`const Nothing &`) is unnamed and unused.
// NOTE(review): only two arguments of the underlying computation are visible
// in this view (multiplied channel count and vector length); the surrounding
// expression and return statement are not shown — confirm against the full file.
131 template <
class Strategy>
132 unsigned int planar_cycle_estimate(
const DepthwiseArgs &
args,
const Nothing &)
// input_channels * channel_multiplier, then the vector length for the
// strategy's return type.
138 (
long unsigned)
args.input_channels *
args.channel_multiplier,
139 arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
// Cycle-estimate callback named for fast-mode kernels.
//
// Template parameter `Strategy` must expose `return_type` and `vl_type`.
// The second parameter (`const Nothing &`) is unnamed and unused.
// NOTE(review): only two arguments of the underlying computation are visible
// in this view (multiplied channel count and vector length); any fast-mode
// specific check and the return statement are not shown — confirm against the
// full file.
143 template <
class Strategy>
144 unsigned int fast_mode_cycle_estimate(
const DepthwiseArgs &
args,
const Nothing &)
// input_channels * channel_multiplier, then the vector length for the
// strategy's return type.
150 (
long unsigned)
args.input_channels *
args.channel_multiplier,
151 arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
// Cycle-estimate callback for the "with_multiplier" entries of the method table.
//
// Returns the maximum `unsigned int` value when prefer_premultiply(args) is
// true, and 0 otherwise. The second parameter (`const Nothing &`) is unnamed
// and unused.
155 unsigned int multiplier_cycle_estimate(
const DepthwiseArgs &
args,
const Nothing &)
157 return prefer_premultiply(
args)? std::numeric_limits<unsigned int>::max() : 0;
// Cycle-estimate callback that ignores both arguments and always returns the
// maximum `unsigned int` value.
// NOTE(review): no use of this function is visible in this view of the file.
160 unsigned int not_preferred(
const DepthwiseArgs &,
const Nothing &)
162 return std::numeric_limits<unsigned int>::max();
// Constraint predicate: true when the caller requested fast mode.
//
// Forward declaration carrying `__attribute__ ((unused))`. The only visible
// users are the SME2 fast-mode table entries, which sit behind
// ARM_COMPUTE_ENABLE_SVE / ARM_COMPUTE_ENABLE_SME2 guards, so the attribute
// presumably silences unused-function warnings in builds without them —
// TODO confirm.
165 bool fast_mode_enabled(
const DepthwiseArgs &
args,
const void *) __attribute__ ((unused));
// Definition: returns args.fast_mode unchanged; the `const void *` parameter
// is unnamed and ignored.
166 bool fast_mode_enabled(
const DepthwiseArgs &
args,
const void *)
168 return args.fast_mode;
170 #endif // defined(__aarch64__)
// Table of candidate fp32 depthwise-convolution implementations.
//
// Each entry lists, in order: a DepthwiseMethod, the kernel name string, a
// constraint(...) set of predicates (where present), a cycle-estimate callable
// (where present), and a factory lambda that allocates the strategy/kernel and
// returns a DepthwiseCommon<float, float, float> *. The table is terminated by
// a sentinel entry (DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr).
//
// Entries are grouped by preprocessor guard: SME2, then SVE, then plain
// aarch64 ("a64") kernels.
// NOTE(review): how the selector walks this table (entry order vs. estimate
// comparison) is not visible here — confirm before reordering entries. Some
// fields of individual entries are also not visible in this view.
173 static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
174 #if defined(__aarch64__)
175 #if defined(ARM_COMPUTE_ENABLE_SVE)
176 #if defined(ARM_COMPUTE_ENABLE_SME2)
// --- SME2 planar kernels gated on fast_mode_enabled (plus SME/SME2 CPU
// checks, no channel multiplier, and the no_prime_right_pad predicate). ---
178 DepthwiseMethod::PLANAR,
179 "sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za",
180 constraint(fast_mode_enabled,
181 cpu_has_sme, cpu_has_sme2,
182 is_supported<sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za>,
183 has_no_channel_multiplier, no_prime_right_pad),
185 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
186 auto strat =
new sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(
args.cpu_info);
187 return new DepthwisePlanar<float>(strat,
args);
191 DepthwiseMethod::PLANAR,
192 "sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za",
193 constraint(fast_mode_enabled,
194 cpu_has_sme, cpu_has_sme2,
195 is_supported<sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za>,
196 has_no_channel_multiplier, no_prime_right_pad),
198 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
199 auto strat =
new sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(
args.cpu_info);
200 return new DepthwisePlanar<float>(strat,
args);
204 DepthwiseMethod::PLANAR,
205 "sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za",
206 constraint(fast_mode_enabled,
207 cpu_has_sme, cpu_has_sme2,
208 is_supported<sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za>,
209 has_no_channel_multiplier, no_prime_right_pad),
211 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
212 auto strat =
new sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(
args.cpu_info);
213 return new DepthwisePlanar<float>(strat,
args);
217 DepthwiseMethod::PLANAR,
218 "sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za",
219 constraint(fast_mode_enabled,
220 cpu_has_sme, cpu_has_sme2,
221 is_supported<sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za>,
222 has_no_channel_multiplier, no_prime_right_pad),
224 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
225 auto strat =
new sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(
args.cpu_info);
226 return new DepthwisePlanar<float>(strat,
args);
// --- SME2 planar fp32 MLA kernels (same constraints as above, minus the
// fast-mode requirement). ---
231 DepthwiseMethod::PLANAR,
232 "sme2_fp32_planar_3x3_s1_4rows_mla_za",
233 constraint(cpu_has_sme, cpu_has_sme2,
234 is_supported<sme2_fp32_planar_3x3_s1_4rows_mla_za>,
235 has_no_channel_multiplier, no_prime_right_pad),
// Custom estimate: special-cases problems whose input plane
// (input_rows * input_cols) is smaller than the channel count, otherwise
// defers to planar_cycle_estimate.
// NOTE(review): the value returned in the special case is not visible here.
236 [] (
const DepthwiseArgs &
args,
const Nothing &os) ->
unsigned int {
239 if (
args.input_rows *
args.input_cols <
args.input_channels)
242 return planar_cycle_estimate<sme2_fp32_planar_3x3_s1_4rows_mla_za>(
args, os);
244 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
245 auto strat =
new sme2_fp32_planar_3x3_s1_4rows_mla_za(
args.cpu_info);
246 return new DepthwisePlanar<float>(strat,
args);
250 DepthwiseMethod::PLANAR,
251 "sme2_fp32_planar_3x3_s2_4rows_mla_za",
252 constraint(cpu_has_sme, cpu_has_sme2,
253 is_supported<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
254 has_no_channel_multiplier, no_prime_right_pad),
255 planar_cycle_estimate<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
256 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
257 auto strat =
new sme2_fp32_planar_3x3_s2_4rows_mla_za(
args.cpu_info);
258 return new DepthwisePlanar<float>(strat,
args);
262 DepthwiseMethod::PLANAR,
263 "sme2_fp32_planar_5x5_s1_4rows_mla_za",
264 constraint(cpu_has_sme, cpu_has_sme2,
265 is_supported<sme2_fp32_planar_5x5_s1_4rows_mla_za>,
266 has_no_channel_multiplier, no_prime_right_pad),
268 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
269 auto strat =
new sme2_fp32_planar_5x5_s1_4rows_mla_za(
args.cpu_info);
270 return new DepthwisePlanar<float>(strat,
args);
274 DepthwiseMethod::PLANAR,
275 "sme2_fp32_planar_5x5_s2_4rows_mla_za",
276 constraint(cpu_has_sme, cpu_has_sme2,
277 is_supported<sme2_fp32_planar_5x5_s2_4rows_mla_za>,
278 has_no_channel_multiplier, no_prime_right_pad),
280 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
281 auto strat =
new sme2_fp32_planar_5x5_s2_4rows_mla_za(
args.cpu_info);
282 return new DepthwisePlanar<float>(strat,
args);
// --- SME2 depthfirst kernels, scored with cycle_estimate<>. ---
287 DepthwiseMethod::DEPTHFIRST,
288 "sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
289 constraint(cpu_has_sme, cpu_has_sme2,
290 is_supported<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
291 cycle_estimate<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
292 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
293 auto strat =
new sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(
args.cpu_info);
294 return new DepthwiseDepthfirst<float, float, float, float>(strat,
args);
298 DepthwiseMethod::DEPTHFIRST,
299 "sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
300 constraint(cpu_has_sme, cpu_has_sme2,
301 is_supported<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
302 cycle_estimate<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
303 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
304 auto strat =
new sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(
args.cpu_info);
305 return new DepthwiseDepthfirst<float, float, float, float>(strat,
args);
309 DepthwiseMethod::DEPTHFIRST,
310 "sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
311 constraint(cpu_has_sme, cpu_has_sme2,
312 is_supported<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
313 cycle_estimate<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
314 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
315 auto strat =
new sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(
args.cpu_info);
316 return new DepthwiseDepthfirst<float, float, float, float>(strat,
args);
320 DepthwiseMethod::DEPTHFIRST,
321 "sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
322 constraint(cpu_has_sme, cpu_has_sme2,
323 is_supported<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
324 cycle_estimate<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
325 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
326 auto strat =
new sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(
args.cpu_info);
327 return new DepthwiseDepthfirst<float, float, float, float>(strat,
args);
330 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
// --- SVE depthfirst kernels, scored with cycle_estimate<>.
// NOTE(review): the tail of each constraint(...) list in this group is not
// visible in this view. ---
332 DepthwiseMethod::DEPTHFIRST,
333 "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
334 constraint(is_supported<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
336 cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
337 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
338 auto strat =
new sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(
args.cpu_info);
339 return new DepthwiseDepthfirst<float>(strat,
args);
343 DepthwiseMethod::DEPTHFIRST,
344 "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
345 constraint(is_supported<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
347 cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
348 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
349 auto strat =
new sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(
args.cpu_info);
350 return new DepthwiseDepthfirst<float>(strat,
args);
354 DepthwiseMethod::DEPTHFIRST,
355 "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
356 constraint(is_supported<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
358 cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
359 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
360 auto strat =
new sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(
args.cpu_info);
361 return new DepthwiseDepthfirst<float>(strat,
args);
365 DepthwiseMethod::DEPTHFIRST,
366 "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
367 constraint(is_supported<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
369 cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
370 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
371 auto strat =
new sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(
args.cpu_info);
372 return new DepthwiseDepthfirst<float>(strat,
args);
376 DepthwiseMethod::DEPTHFIRST,
377 "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
378 constraint(is_supported<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
380 cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
381 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
382 auto strat =
new sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(
args.cpu_info);
383 return new DepthwiseDepthfirst<float>(strat,
args);
// SVE generic kernel: constrained only on cpu_has_sve; wraps the "output9"
// kernel in a GenericDepthfirstStrategy constructed with (3, 3).
387 DepthwiseMethod::DEPTHFIRST,
388 "sve_fp32_nhwc_generic_output3x3_mla_depthfirst",
389 constraint(cpu_has_sve),
391 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
392 auto kern =
new sve_fp32_nhwc_generic_output9_mla_depthfirst(
args.cpu_info);
393 auto strat =
new GenericDepthfirstStrategy<float>(kern, 3, 3,
args);
394 return new DepthwiseDepthfirstGeneric<float>(strat,
args);
// --- SVE kernels for problems with a channel multiplier
// (has_channel_multiplier), scored with multiplier_cycle_estimate. ---
398 DepthwiseMethod::DEPTHFIRST,
399 "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
400 constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
401 cpu_has_sve, has_channel_multiplier),
402 multiplier_cycle_estimate,
403 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
404 auto strat =
new sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(
args.cpu_info);
405 return new DepthwiseDepthfirstMultiplier<float>(strat,
args);
409 DepthwiseMethod::DEPTHFIRST,
410 "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
411 constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
412 cpu_has_sve, has_channel_multiplier),
413 multiplier_cycle_estimate,
414 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
415 auto strat =
new sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(
args.cpu_info);
416 return new DepthwiseDepthfirstMultiplier<float>(strat,
args);
420 DepthwiseMethod::DEPTHFIRST,
421 "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
422 constraint(cpu_has_sve, has_channel_multiplier),
423 multiplier_cycle_estimate,
424 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
425 auto kern =
new sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(
args.cpu_info);
426 auto strat =
new GenericDepthfirstMultiplierStrategy<float>(kern,
args);
427 return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat,
args);
430 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
// --- Plain aarch64 ("a64") depthfirst kernels: constrained only on
// is_supported<>, scored with cycle_estimate<>. ---
432 DepthwiseMethod::DEPTHFIRST,
433 "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
434 constraint(is_supported<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
435 cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
436 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
437 auto strat =
new a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(
args.cpu_info);
438 return new DepthwiseDepthfirst<float>(strat,
args);
442 DepthwiseMethod::DEPTHFIRST,
443 "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
444 constraint(is_supported<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
445 cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
446 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
447 auto strat =
new a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(
args.cpu_info);
448 return new DepthwiseDepthfirst<float>(strat,
args);
452 DepthwiseMethod::DEPTHFIRST,
453 "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
454 constraint(is_supported<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
455 cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
456 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
457 auto strat =
new a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(
args.cpu_info);
458 return new DepthwiseDepthfirst<float>(strat,
args);
462 DepthwiseMethod::DEPTHFIRST,
463 "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
464 constraint(is_supported<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
465 cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
466 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
467 auto strat =
new a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(
args.cpu_info);
468 return new DepthwiseDepthfirst<float>(strat,
args);
472 DepthwiseMethod::DEPTHFIRST,
473 "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
474 constraint(is_supported<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>),
475 cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
476 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
477 auto strat =
new a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(
args.cpu_info);
478 return new DepthwiseDepthfirst<float>(strat,
args);
// a64 generic kernel: wraps the "output9" kernel in a
// GenericDepthfirstStrategy constructed with (3, 3).
// NOTE(review): this entry's constraint and estimate fields are not visible
// in this view.
482 DepthwiseMethod::DEPTHFIRST,
483 "a64_fp32_nhwc_generic_output3x3_mla_depthfirst",
486 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
487 auto kern =
new a64_fp32_nhwc_generic_output9_mla_depthfirst(
args.cpu_info);
488 auto strat =
new GenericDepthfirstStrategy<float>(kern, 3, 3,
args);
489 return new DepthwiseDepthfirstGeneric<float>(strat,
args);
// --- a64 kernels for problems with a channel multiplier
// (has_channel_multiplier), scored with multiplier_cycle_estimate. ---
493 DepthwiseMethod::DEPTHFIRST,
494 "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
495 constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
496 has_channel_multiplier),
497 multiplier_cycle_estimate,
498 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
499 auto strat =
new a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(
args.cpu_info);
500 return new DepthwiseDepthfirstMultiplier<float>(strat,
args);
504 DepthwiseMethod::DEPTHFIRST,
505 "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
506 constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
507 has_channel_multiplier),
508 multiplier_cycle_estimate,
509 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
510 auto strat =
new a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(
args.cpu_info);
511 return new DepthwiseDepthfirstMultiplier<float>(strat,
args);
515 DepthwiseMethod::DEPTHFIRST,
516 "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
517 constraint(has_channel_multiplier),
518 multiplier_cycle_estimate,
519 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<float, float, float> * {
520 auto kern =
new a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(
args.cpu_info);
521 auto strat =
new GenericDepthfirstMultiplierStrategy<float>(kern,
args);
522 return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat,
args);
525 #endif // defined(__aarch64__)
// Sentinel entry marking the end of the table: DEFAULT method, empty name,
// and null constraint / estimate / factory pointers.
526 { DepthwiseMethod::DEFAULT,
"",
nullptr,
nullptr,
nullptr },
532 return depthwise_fp32_methods;
// Explicit instantiation of the `depthwise` function template for the
// signature below (returning UniqueDepthwiseCommon<float>), so that its
// definition is emitted from this translation unit.
// NOTE(review): the template definition itself is not visible in this view.
535 template UniqueDepthwiseCommon<float>
depthwise(
const DepthwiseArgs &,
const Nothing &);