Compute Library
 23.08
depthwise_fp32.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2023 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "arm_gemm_local.hpp"
26 
28 #include "depthwise_depthfirst.hpp"
31 #include "depthwise_planar.hpp"
32 
34 
35 #include "interleaves/list.hpp"
36 
37 #if defined(__aarch64__)
38 #if defined(ARM_COMPUTE_ENABLE_SVE)
39 #if defined(ARM_COMPUTE_ENABLE_SME2)
44 
49 
54 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
55 
65 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
75 #endif // defined(__aarch64__)
76 
77 namespace arm_conv {
78 namespace depthwise {
79 
80 namespace
81 {
82 #if defined(__aarch64__)
83  bool prefer_premultiply(const DepthwiseArgs &args) {
84  if ((args.stride_rows != args.stride_cols) || (args.kernel_rows != args.kernel_cols))
85  {
86  return false;
87  }
88 
89  unsigned int threshold;
90 
91  if (args.stride_rows == 1 && args.kernel_rows == 3)
92  {
93  threshold = 18;
94  }
95  else if (args.stride_rows == 1 && args.kernel_rows == 5)
96  {
97  threshold = 5;
98  }
99  else if (args.stride_rows == 2 && args.kernel_rows == 3)
100  {
101  threshold = 5;
102  }
103  else if (args.stride_rows == 2 && args.kernel_rows == 5)
104  {
105  threshold = 12;
106  } else
107  {
108  return false;
109  }
110 
111  return args.channel_multiplier <= threshold;
112  }
113 
114  template <class Strategy>
115  unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
116  {
117  if (args.channel_multiplier > 1 && !prefer_premultiply(args))
118  {
119  return std::numeric_limits<unsigned int>::max();
120  }
121 
122  // First-pass: compute the number of output pixels which will be computed.
123  return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
124  arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
126  (long unsigned) args.input_channels * args.channel_multiplier,
127  arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
128  );
129  }
130 
131  template <class Strategy>
132  unsigned int planar_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
133  {
134  // First-pass: compute the number of output pixels which will be computed.
135  return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
136  args.output_cols *
138  (long unsigned) args.input_channels * args.channel_multiplier,
139  arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
140  );
141  }
142 
143  template <class Strategy>
144  unsigned int fast_mode_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
145  {
146  // First-pass: compute the number of output pixels which will be computed.
147  return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
148  arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
150  (long unsigned) args.input_channels * args.channel_multiplier,
151  arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
152  ) * 2 / 3;
153  }
154 
155  unsigned int multiplier_cycle_estimate(const DepthwiseArgs &args, const Nothing &)
156  {
157  return prefer_premultiply(args)? std::numeric_limits<unsigned int>::max() : 0;
158  }
159 
160  unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
161  {
162  return std::numeric_limits<unsigned int>::max();
163  }
164 
165  bool fast_mode_enabled(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
166  bool fast_mode_enabled(const DepthwiseArgs &args, const void *)
167  {
168  return args.fast_mode;
169  }
170 #endif // defined(__aarch64__)
171 }
172 
173 static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
174 #if defined(__aarch64__)
175 #if defined(ARM_COMPUTE_ENABLE_SVE)
176 #if defined(ARM_COMPUTE_ENABLE_SME2)
177  {
178  DepthwiseMethod::PLANAR,
179  "sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za",
180  constraint(fast_mode_enabled,
181  cpu_has_sme, cpu_has_sme2,
182  is_supported<sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za>,
183  has_no_channel_multiplier, no_prime_right_pad),
184  nullptr,
185  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
186  auto strat = new sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za(args.cpu_info);
187  return new DepthwisePlanar<float>(strat, args);
188  },
189  },
190  {
191  DepthwiseMethod::PLANAR,
192  "sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za",
193  constraint(fast_mode_enabled,
194  cpu_has_sme, cpu_has_sme2,
195  is_supported<sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za>,
196  has_no_channel_multiplier, no_prime_right_pad),
197  nullptr,
198  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
199  auto strat = new sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za(args.cpu_info);
200  return new DepthwisePlanar<float>(strat, args);
201  },
202  },
203  {
204  DepthwiseMethod::PLANAR,
205  "sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za",
206  constraint(fast_mode_enabled,
207  cpu_has_sme, cpu_has_sme2,
208  is_supported<sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za>,
209  has_no_channel_multiplier, no_prime_right_pad),
210  nullptr,
211  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
212  auto strat = new sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za(args.cpu_info);
213  return new DepthwisePlanar<float>(strat, args);
214  },
215  },
216  {
217  DepthwiseMethod::PLANAR,
218  "sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za",
219  constraint(fast_mode_enabled,
220  cpu_has_sme, cpu_has_sme2,
221  is_supported<sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za>,
222  has_no_channel_multiplier, no_prime_right_pad),
223  nullptr,
224  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
225  auto strat = new sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za(args.cpu_info);
226  return new DepthwisePlanar<float>(strat, args);
227  },
228  },
229 
230  {
231  DepthwiseMethod::PLANAR,
232  "sme2_fp32_planar_3x3_s1_4rows_mla_za",
233  constraint(cpu_has_sme, cpu_has_sme2,
234  is_supported<sme2_fp32_planar_3x3_s1_4rows_mla_za>,
235  has_no_channel_multiplier, no_prime_right_pad),
236  [] (const DepthwiseArgs &args, const Nothing &os) -> unsigned int {
237  // Heuristic, don't prefer this kernel unless the input plane is greater
238  // than the number of channels.
239  if (args.input_rows * args.input_cols < args.input_channels)
240  return UINT32_MAX;
241 
242  return planar_cycle_estimate<sme2_fp32_planar_3x3_s1_4rows_mla_za>(args, os);
243  },
244  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
245  auto strat = new sme2_fp32_planar_3x3_s1_4rows_mla_za(args.cpu_info);
246  return new DepthwisePlanar<float>(strat, args);
247  },
248  },
249  {
250  DepthwiseMethod::PLANAR,
251  "sme2_fp32_planar_3x3_s2_4rows_mla_za",
252  constraint(cpu_has_sme, cpu_has_sme2,
253  is_supported<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
254  has_no_channel_multiplier, no_prime_right_pad),
255  planar_cycle_estimate<sme2_fp32_planar_3x3_s2_4rows_mla_za>,
256  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
257  auto strat = new sme2_fp32_planar_3x3_s2_4rows_mla_za(args.cpu_info);
258  return new DepthwisePlanar<float>(strat, args);
259  },
260  },
261  {
262  DepthwiseMethod::PLANAR,
263  "sme2_fp32_planar_5x5_s1_4rows_mla_za",
264  constraint(cpu_has_sme, cpu_has_sme2,
265  is_supported<sme2_fp32_planar_5x5_s1_4rows_mla_za>,
266  has_no_channel_multiplier, no_prime_right_pad),
267  nullptr,
268  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
269  auto strat = new sme2_fp32_planar_5x5_s1_4rows_mla_za(args.cpu_info);
270  return new DepthwisePlanar<float>(strat, args);
271  },
272  },
273  {
274  DepthwiseMethod::PLANAR,
275  "sme2_fp32_planar_5x5_s2_4rows_mla_za",
276  constraint(cpu_has_sme, cpu_has_sme2,
277  is_supported<sme2_fp32_planar_5x5_s2_4rows_mla_za>,
278  has_no_channel_multiplier, no_prime_right_pad),
279  nullptr,
280  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
281  auto strat = new sme2_fp32_planar_5x5_s2_4rows_mla_za(args.cpu_info);
282  return new DepthwisePlanar<float>(strat, args);
283  },
284  },
285 
286  {
287  DepthwiseMethod::DEPTHFIRST,
288  "sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
289  constraint(cpu_has_sme, cpu_has_sme2,
290  is_supported<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
291  cycle_estimate<sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
292  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
293  auto strat = new sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
294  return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
295  },
296  },
297  {
298  DepthwiseMethod::DEPTHFIRST,
299  "sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
300  constraint(cpu_has_sme, cpu_has_sme2,
301  is_supported<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
302  cycle_estimate<sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
303  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
304  auto strat = new sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
305  return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
306  },
307  },
308  {
309  DepthwiseMethod::DEPTHFIRST,
310  "sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
311  constraint(cpu_has_sme, cpu_has_sme2,
312  is_supported<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
313  cycle_estimate<sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
314  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
315  auto strat = new sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
316  return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
317  },
318  },
319  {
320  DepthwiseMethod::DEPTHFIRST,
321  "sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
322  constraint(cpu_has_sme, cpu_has_sme2,
323  is_supported<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
324  cycle_estimate<sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
325  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
326  auto strat = new sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
327  return new DepthwiseDepthfirst<float, float, float, float>(strat, args);
328  },
329  },
330 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
331  {
332  DepthwiseMethod::DEPTHFIRST,
333  "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
334  constraint(is_supported<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
335  cpu_has_sve),
336  cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
337  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
338  auto strat = new sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
339  return new DepthwiseDepthfirst<float>(strat, args);
340  },
341  },
342  {
343  DepthwiseMethod::DEPTHFIRST,
344  "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
345  constraint(is_supported<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
346  cpu_has_sve),
347  cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
348  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
349  auto strat = new sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
350  return new DepthwiseDepthfirst<float>(strat, args);
351  },
352  },
353  {
354  DepthwiseMethod::DEPTHFIRST,
355  "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
356  constraint(is_supported<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
357  cpu_has_sve),
358  cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
359  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
360  auto strat = new sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
361  return new DepthwiseDepthfirst<float>(strat, args);
362  },
363  },
364  {
365  DepthwiseMethod::DEPTHFIRST,
366  "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
367  constraint(is_supported<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
368  cpu_has_sve),
369  cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
370  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
371  auto strat = new sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
372  return new DepthwiseDepthfirst<float>(strat, args);
373  },
374  },
375  {
376  DepthwiseMethod::DEPTHFIRST,
377  "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
378  constraint(is_supported<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
379  cpu_has_sve),
380  cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
381  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
382  auto strat = new sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
383  return new DepthwiseDepthfirst<float>(strat, args);
384  },
385  },
386  {
387  DepthwiseMethod::DEPTHFIRST,
388  "sve_fp32_nhwc_generic_output3x3_mla_depthfirst",
389  constraint(cpu_has_sve),
390  not_preferred,
391  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
392  auto kern = new sve_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
393  auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
394  return new DepthwiseDepthfirstGeneric<float>(strat, args);
395  },
396  },
397  {
398  DepthwiseMethod::DEPTHFIRST,
399  "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
400  constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
401  cpu_has_sve, has_channel_multiplier),
402  multiplier_cycle_estimate,
403  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
404  auto strat = new sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
405  return new DepthwiseDepthfirstMultiplier<float>(strat, args);
406  },
407  },
408  {
409  DepthwiseMethod::DEPTHFIRST,
410  "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
411  constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
412  cpu_has_sve, has_channel_multiplier),
413  multiplier_cycle_estimate,
414  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
415  auto strat = new sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
416  return new DepthwiseDepthfirstMultiplier<float>(strat, args);
417  },
418  },
419  {
420  DepthwiseMethod::DEPTHFIRST,
421  "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
422  constraint(cpu_has_sve, has_channel_multiplier),
423  multiplier_cycle_estimate,
424  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
425  auto kern = new sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
426  auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
427  return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
428  },
429  },
430 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
431  {
432  DepthwiseMethod::DEPTHFIRST,
433  "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
434  constraint(is_supported<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>),
435  cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
436  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
437  auto strat = new a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
438  return new DepthwiseDepthfirst<float>(strat, args);
439  },
440  },
441  {
442  DepthwiseMethod::DEPTHFIRST,
443  "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
444  constraint(is_supported<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>),
445  cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
446  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
447  auto strat = new a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
448  return new DepthwiseDepthfirst<float>(strat, args);
449  },
450  },
451  {
452  DepthwiseMethod::DEPTHFIRST,
453  "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
454  constraint(is_supported<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>),
455  cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
456  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
457  auto strat = new a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
458  return new DepthwiseDepthfirst<float>(strat, args);
459  },
460  },
461  {
462  DepthwiseMethod::DEPTHFIRST,
463  "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
464  constraint(is_supported<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>),
465  cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
466  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
467  auto strat = new a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
468  return new DepthwiseDepthfirst<float>(strat, args);
469  },
470  },
471  {
472  DepthwiseMethod::DEPTHFIRST,
473  "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
474  constraint(is_supported<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>),
475  cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
476  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
477  auto strat = new a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
478  return new DepthwiseDepthfirst<float>(strat, args);
479  },
480  },
481  {
482  DepthwiseMethod::DEPTHFIRST,
483  "a64_fp32_nhwc_generic_output3x3_mla_depthfirst",
484  nullptr,
485  not_preferred,
486  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
487  auto kern = new a64_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
488  auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
489  return new DepthwiseDepthfirstGeneric<float>(strat, args);
490  },
491  },
492  {
493  DepthwiseMethod::DEPTHFIRST,
494  "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
495  constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
496  has_channel_multiplier),
497  multiplier_cycle_estimate,
498  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
499  auto strat = new a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
500  return new DepthwiseDepthfirstMultiplier<float>(strat, args);
501  },
502  },
503  {
504  DepthwiseMethod::DEPTHFIRST,
505  "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
506  constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
507  has_channel_multiplier),
508  multiplier_cycle_estimate,
509  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
510  auto strat = new a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
511  return new DepthwiseDepthfirstMultiplier<float>(strat, args);
512  },
513  },
514  {
515  DepthwiseMethod::DEPTHFIRST,
516  "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
517  constraint(has_channel_multiplier),
518  multiplier_cycle_estimate,
519  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
520  auto kern = new a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
521  auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
522  return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
523  },
524  },
525 #endif // defined(__aarch64__)
526  { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
527 };
528 
529 template <>
531 {
532  return depthwise_fp32_methods;
533 }
534 
// Explicit instantiations of the fp32 entry points (depthwise and
// get_compatible_kernels<float>) in this translation unit.
535 template UniqueDepthwiseCommon<float> depthwise(const DepthwiseArgs &, const Nothing &);
536 template std::vector<KernelDescription> get_compatible_kernels<float>(const DepthwiseArgs &, const Nothing &);
537 
538 } // namespace depthwise
539 } // namespace arm_conv
GemmTuner.args
args
Definition: GemmTuner.py:679
sme2_fp32bf16fp32_planar_5x5_s1_4rows_dot_za.hpp
arm_conv::depthwise::depthwise
template UniqueDepthwiseCommon< float > depthwise(const DepthwiseArgs &, const Nothing &)
sve_fp32_nhwc_generic_output9_mla_depthfirst.hpp
depthwise_depthfirst_generic.hpp
sme2_fp32bf16fp32_planar_3x3_s1_4rows_dot_za.hpp
sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst.hpp
sme2_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
sme2_fp32_planar_3x3_s2_4rows_mla_za.hpp
arm_gemm::roundup
T roundup(const T a, const T b)
Definition: utils.hpp:70
depthwise_depthfirst_multiplier.hpp
arm_gemm_local.hpp
arm_gemm::Nothing
Definition: arm_gemm.hpp:211
depthwise_depthfirst.hpp
sme2_fp32bf16fp32_planar_5x5_s2_4rows_dot_za.hpp
sme2_fp32bf16fp32_planar_3x3_s2_4rows_dot_za.hpp
depthwise_planar.hpp
sme2_fp32_planar_5x5_s2_4rows_mla_za.hpp
sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
depthwise_implementation.hpp
sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
arm_conv::depthwise::DepthwiseImplementation
Definition: depthwise_implementation.hpp:38
a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
a64_fp32_nhwc_generic_output9_mla_depthfirst.hpp
a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst.hpp
sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
arm_conv::depthwise::get_compatible_kernels< float >
template std::vector< KernelDescription > get_compatible_kernels< float >(const DepthwiseArgs &, const Nothing &)
arm_gemm::iceildiv
T iceildiv(const T a, const T b)
Definition: utils.hpp:65
a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst.hpp
arm_conv::depthwise::depthwise_implementation_list
const DepthwiseImplementation< float > * depthwise_implementation_list()
Definition: depthwise_fp32.cpp:530
a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
sme2_fp32_planar_3x3_s1_4rows_mla_za.hpp
sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst.hpp
arm_conv
Definition: addressing.cpp:30
sme2_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst.hpp
sme2_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst.hpp
sme2_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst.hpp
sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst.hpp
depthwise_implementation_constraints.hpp
sme2_fp32_planar_5x5_s1_4rows_mla_za.hpp