Compute Library
 22.08
depthwise_fp32.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "arm_gemm_local.hpp"
26 
28 #include "depthwise_depthfirst.hpp"
31 #include "depthwise_planar.hpp"
32 
34 
35 #if defined(__aarch64__)
36 #if defined(ARM_COMPUTE_ENABLE_SVE)
46 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
56 #endif // defined(__aarch64__)
57 
58 namespace arm_conv {
59 namespace depthwise {
60 
61 namespace
62 {
63  template <class Strategy>
64  unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
65  {
66  // First-pass: compute the number of output pixels which will be computed.
67  return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
68  arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
70  (long unsigned) args.input_channels * args.channel_multiplier,
71  arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
72  );
73  }
74 
75 #if defined(__aarch64__)
76  unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
77  {
78  return std::numeric_limits<unsigned int>::max();
79  }
80 
81  bool fast_mode_enabled(const DepthwiseArgs &args, const void *) __attribute__ ((unused));
82  bool fast_mode_enabled(const DepthwiseArgs &args, const void *)
83  {
84  return args.fast_mode;
85  }
86 #endif // defined(__aarch64__)
87 }
88 
89 static const DepthwiseImplementation<float, float> depthwise_fp32_methods[] = {
90 #if defined(__aarch64__)
91 #if defined(ARM_COMPUTE_ENABLE_SVE)
92  {
93  DepthwiseMethod::DEPTHFIRST,
94  "sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
95  constraint(is_supported<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
96  has_no_channel_multiplier,
97  cpu_has_sve),
98  cycle_estimate<sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
99  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
100  auto strat = new sve_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
101  return new DepthwiseDepthfirst<float>(strat, args);
102  },
103  },
104  {
105  DepthwiseMethod::DEPTHFIRST,
106  "sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
107  constraint(is_supported<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
108  has_no_channel_multiplier,
109  cpu_has_sve),
110  cycle_estimate<sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
111  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
112  auto strat = new sve_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
113  return new DepthwiseDepthfirst<float>(strat, args);
114  },
115  },
116  {
117  DepthwiseMethod::DEPTHFIRST,
118  "sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
119  constraint(is_supported<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
120  has_no_channel_multiplier,
121  cpu_has_sve),
122  cycle_estimate<sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
123  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
124  auto strat = new sve_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
125  return new DepthwiseDepthfirst<float>(strat, args);
126  },
127  },
128  {
129  DepthwiseMethod::DEPTHFIRST,
130  "sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
131  constraint(is_supported<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
132  has_no_channel_multiplier,
133  cpu_has_sve),
134  cycle_estimate<sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
135  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
136  auto strat = new sve_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
137  return new DepthwiseDepthfirst<float>(strat, args);
138  },
139  },
140  {
141  DepthwiseMethod::DEPTHFIRST,
142  "sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
143  constraint(is_supported<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
144  has_no_channel_multiplier,
145  cpu_has_sve),
146  cycle_estimate<sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
147  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
148  auto strat = new sve_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
149  return new DepthwiseDepthfirst<float>(strat, args);
150  },
151  },
152  {
153  DepthwiseMethod::DEPTHFIRST,
154  "sve_fp32_nhwc_generic_output3x3_mla_depthfirst",
155  constraint(has_no_channel_multiplier, cpu_has_sve),
156  not_preferred,
157  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
158  auto kern = new sve_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
159  auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
160  return new DepthwiseDepthfirstGeneric<float>(strat, args);
161  },
162  },
163  {
164  DepthwiseMethod::DEPTHFIRST,
165  "sve_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
166  constraint(is_supported<sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
167  cpu_has_sve, has_channel_multiplier),
168  nullptr,
169  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
170  auto strat = new sve_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
171  return new DepthwiseDepthfirstMultiplier<float>(strat, args);
172  },
173  },
174  {
175  DepthwiseMethod::DEPTHFIRST,
176  "sve_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
177  constraint(is_supported<sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
178  cpu_has_sve, has_channel_multiplier),
179  nullptr,
180  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
181  auto strat = new sve_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
182  return new DepthwiseDepthfirstMultiplier<float>(strat, args);
183  },
184  },
185  {
186  DepthwiseMethod::DEPTHFIRST,
187  "sve_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
188  constraint(cpu_has_sve, has_channel_multiplier),
189  nullptr,
190  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
191  auto kern = new sve_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
192  auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
193  return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
194  },
195  },
196 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
197  {
198  DepthwiseMethod::DEPTHFIRST,
199  "a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst",
200  constraint(is_supported<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
201  has_no_channel_multiplier),
202  cycle_estimate<a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst>,
203  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
204  auto strat = new a64_fp32_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
205  return new DepthwiseDepthfirst<float>(strat, args);
206  },
207  },
208  {
209  DepthwiseMethod::DEPTHFIRST,
210  "a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst",
211  constraint(is_supported<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
212  has_no_channel_multiplier),
213  cycle_estimate<a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst>,
214  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
215  auto strat = new a64_fp32_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
216  return new DepthwiseDepthfirst<float>(strat, args);
217  },
218  },
219  {
220  DepthwiseMethod::DEPTHFIRST,
221  "a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst",
222  constraint(is_supported<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
223  has_no_channel_multiplier),
224  cycle_estimate<a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst>,
225  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
226  auto strat = new a64_fp32_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
227  return new DepthwiseDepthfirst<float>(strat, args);
228  },
229  },
230  {
231  DepthwiseMethod::DEPTHFIRST,
232  "a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst",
233  constraint(is_supported<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
234  has_no_channel_multiplier),
235  cycle_estimate<a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst>,
236  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
237  auto strat = new a64_fp32_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
238  return new DepthwiseDepthfirst<float>(strat, args);
239  },
240  },
241  {
242  DepthwiseMethod::DEPTHFIRST,
243  "a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst",
244  constraint(is_supported<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
245  has_no_channel_multiplier),
246  cycle_estimate<a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst>,
247  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
248  auto strat = new a64_fp32_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
249  return new DepthwiseDepthfirst<float>(strat, args);
250  },
251  },
252  {
253  DepthwiseMethod::DEPTHFIRST,
254  "a64_fp32_nhwc_generic_output3x3_mla_depthfirst",
255  constraint(has_no_channel_multiplier),
256  not_preferred,
257  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
258  auto kern = new a64_fp32_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
259  auto strat = new GenericDepthfirstStrategy<float>(kern, 3, 3, args);
260  return new DepthwiseDepthfirstGeneric<float>(strat, args);
261  },
262  },
263  {
264  DepthwiseMethod::DEPTHFIRST,
265  "a64_fp32_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst",
266  constraint(is_supported<a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst>,
267  has_channel_multiplier),
268  nullptr,
269  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
270  auto strat = new a64_fp32_packed_to_nhwc_3x3_s2_with_multiplier_output3x3_mla_depthfirst(args.cpu_info);
271  return new DepthwiseDepthfirstMultiplier<float>(strat, args);
272  },
273  },
274  {
275  DepthwiseMethod::DEPTHFIRST,
276  "a64_fp32_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst",
277  constraint(is_supported<a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst>,
278  has_channel_multiplier),
279  nullptr,
280  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
281  auto strat = new a64_fp32_packed_to_nhwc_5x5_s1_with_multiplier_output2x4_mla_depthfirst(args.cpu_info);
282  return new DepthwiseDepthfirstMultiplier<float>(strat, args);
283  },
284  },
285  {
286  DepthwiseMethod::DEPTHFIRST,
287  "a64_fp32_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
288  constraint(has_channel_multiplier),
289  nullptr,
290  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<float, float, float> * {
291  auto kern = new a64_fp32_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
292  auto strat = new GenericDepthfirstMultiplierStrategy<float>(kern, args);
293  return new DepthwiseDepthfirstMultiplier<float, float, float, float, true>(strat, args);
294  },
295  },
296 #endif // defined(__aarch64__)
297  { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
298 };
299 
300 template <>
302 {
303  return depthwise_fp32_methods;
304 }
305 
306 template UniqueDepthwiseCommon<float> depthwise(const DepthwiseArgs &, const Nothing &);
307 template std::vector<KernelDescription> get_compatible_kernels<float>(const DepthwiseArgs &, const Nothing &);
308 
309 } // namespace depthwise
310 } // namespace arm_conv
T roundup(const T a, const T b)
Definition: utils.hpp:70
const DepthwiseImplementation< float > * depthwise_implementation_list()
T iceildiv(const T a, const T b)
Definition: utils.hpp:65
template std::vector< KernelDescription > get_compatible_kernels< float >(const DepthwiseArgs &, const Nothing &)
template UniqueDepthwiseCommon< float > depthwise(const DepthwiseArgs &, const Nothing &)