Compute Library
 22.08
depthwise_fp16.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "arm_gemm_local.hpp"
26 
28 #include "depthwise_depthfirst.hpp"
31 
33 
34 // This can only be built if the target/compiler supports FP16 arguments.
35 #if defined(__ARM_FP16_ARGS)
36 
37 #if defined(__aarch64__)
38 #if defined(ARM_COMPUTE_ENABLE_SVE)
44 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
45 #if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
53 #endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
54 #endif // defined(__aarch64__)
55 
56 namespace arm_conv {
57 namespace depthwise {
58 
59 namespace
60 {
61  template <class Strategy>
62  unsigned int cycle_estimate(const DepthwiseArgs &args, const Nothing &)
63  {
64  // First-pass: compute the number of output pixels which will be computed.
65  return arm_gemm::roundup(args.output_rows, Strategy::output_rows) *
66  arm_gemm::roundup(args.output_cols, Strategy::output_cols) *
68  (long unsigned) args.input_channels * args.channel_multiplier,
69  arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
70  );
71  }
72 
73 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
74  unsigned int not_preferred(const DepthwiseArgs &, const Nothing &) __attribute__ ((unused));
75  unsigned int not_preferred(const DepthwiseArgs &, const Nothing &)
76  {
77  return std::numeric_limits<unsigned int>::max();
78  }
79 #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
80 }
81 
82 static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = {
83 #if defined(__aarch64__)
84 #if defined(ARM_COMPUTE_ENABLE_SVE)
85  {
86  DepthwiseMethod::DEPTHFIRST,
87  "sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
88  constraint(is_supported<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
89  has_no_channel_multiplier,
90  cpu_has_sve),
91  cycle_estimate<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
92  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
93  auto strat = new sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
94  return new DepthwiseDepthfirst<__fp16>(strat, args);
95  },
96  },
97  {
98  DepthwiseMethod::DEPTHFIRST,
99  "sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
100  constraint(is_supported<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
101  has_no_channel_multiplier,
102  cpu_has_sve),
103  cycle_estimate<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
104  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
105  auto strat = new sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
106  return new DepthwiseDepthfirst<__fp16>(strat, args);
107  },
108  },
109  {
110  DepthwiseMethod::DEPTHFIRST,
111  "sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
112  constraint(is_supported<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
113  has_no_channel_multiplier,
114  cpu_has_sve),
115  cycle_estimate<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
116  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
117  auto strat = new sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
118  return new DepthwiseDepthfirst<__fp16>(strat, args);
119  },
120  },
121  {
122  DepthwiseMethod::DEPTHFIRST,
123  "sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
124  constraint(is_supported<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
125  has_no_channel_multiplier,
126  cpu_has_sve),
127  cycle_estimate<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
128  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
129  auto strat = new sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
130  return new DepthwiseDepthfirst<__fp16>(strat, args);
131  },
132  },
133  {
134  DepthwiseMethod::DEPTHFIRST,
135  "sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
136  constraint(is_supported<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
137  has_no_channel_multiplier,
138  cpu_has_sve),
139  cycle_estimate<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
140  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
141  auto strat = new sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
142  return new DepthwiseDepthfirst<__fp16>(strat, args);
143  },
144  },
145 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
146 #if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
147  {
148  DepthwiseMethod::DEPTHFIRST,
149  "a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
150  constraint(is_supported<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
151  has_no_channel_multiplier,
152  cpu_has_fp16),
153  cycle_estimate<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
154  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
155  auto strat = new a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(args.cpu_info);
156  return new DepthwiseDepthfirst<__fp16>(strat, args);
157  },
158  },
159  {
160  DepthwiseMethod::DEPTHFIRST,
161  "a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
162  constraint(is_supported<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
163  has_no_channel_multiplier,
164  cpu_has_fp16),
165  cycle_estimate<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
166  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
167  auto strat = new a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(args.cpu_info);
168  return new DepthwiseDepthfirst<__fp16>(strat, args);
169  },
170  },
171  {
172  DepthwiseMethod::DEPTHFIRST,
173  "a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
174  constraint(is_supported<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
175  has_no_channel_multiplier,
176  cpu_has_fp16),
177  cycle_estimate<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
178  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
179  auto strat = new a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(args.cpu_info);
180  return new DepthwiseDepthfirst<__fp16>(strat, args);
181  },
182  },
183  {
184  DepthwiseMethod::DEPTHFIRST,
185  "a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
186  constraint(is_supported<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
187  has_no_channel_multiplier,
188  cpu_has_fp16),
189  cycle_estimate<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
190  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
191  auto strat = new a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(args.cpu_info);
192  return new DepthwiseDepthfirst<__fp16>(strat, args);
193  },
194  },
195  {
196  DepthwiseMethod::DEPTHFIRST,
197  "a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
198  constraint(is_supported<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
199  has_no_channel_multiplier,
200  cpu_has_fp16),
201  cycle_estimate<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
202  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
203  auto strat = new a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(args.cpu_info);
204  return new DepthwiseDepthfirst<__fp16>(strat, args);
205  },
206  },
207  {
208  DepthwiseMethod::DEPTHFIRST,
209  "a64_fp16_nhwc_generic_output3x3_mla_depthfirst",
210  constraint(has_no_channel_multiplier, cpu_has_fp16),
211  not_preferred,
212  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
213  auto kern = new a64_fp16_nhwc_generic_output9_mla_depthfirst(args.cpu_info);
214  auto strat = new GenericDepthfirstStrategy<__fp16>(kern, 3, 3, args);
215  return new DepthwiseDepthfirstGeneric<__fp16>(strat, args);
216  },
217  },
218  {
219  DepthwiseMethod::DEPTHFIRST,
220  "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
221  constraint(cpu_has_fp16, has_channel_multiplier),
222  nullptr,
223  [] (const DepthwiseArgs &args, const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
224  auto kern = new a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(args.cpu_info);
225  auto strat = new GenericDepthfirstMultiplierStrategy<__fp16>(kern, args);
226  return new DepthwiseDepthfirstMultiplier<__fp16, __fp16, __fp16, __fp16, true>(strat, args);
227  },
228  },
229 #endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
230 #endif // defined(__aarch64__)
231  { DepthwiseMethod::DEFAULT, "", nullptr, nullptr, nullptr }, // End of list
232 };
233 
234 template <>
235 const DepthwiseImplementation<__fp16> *depthwise_implementation_list()
236 {
237  return depthwise_fp16_methods;
238 }
239 
// Explicit instantiations for __fp16 of `depthwise` and
// `get_compatible_kernels`, both taking (const DepthwiseArgs &, const Nothing &).
// The primary templates are declared outside this listing.
240 template UniqueDepthwiseCommon<__fp16> depthwise(const DepthwiseArgs &, const Nothing &);
241 template std::vector<KernelDescription> get_compatible_kernels<__fp16>(const DepthwiseArgs &, const Nothing &);
242 
243 } // namespace depthwise
244 } // namespace arm_conv
245 
246 #endif // defined(__ARM_FP16_ARGS)
T roundup(const T a, const T b)
Definition: utils.hpp:70
const DepthwiseImplementation< float > * depthwise_implementation_list()
T iceildiv(const T a, const T b)
Definition: utils.hpp:65
template UniqueDepthwiseCommon< float > depthwise(const DepthwiseArgs &, const Nothing &)