36 #if defined(__ARM_FP16_ARGS)
38 #if defined(__aarch64__)
39 #if defined(ARM_COMPUTE_ENABLE_SME2)
45 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
46 #if defined(ARM_COMPUTE_ENABLE_SVE)
52 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
53 #if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
61 #endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
62 #endif // defined(__aarch64__)
69 #if defined(__aarch64__)
70 #if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
// Returns true when args.channel_multiplier is less than or equal to a
// threshold selected from the (stride, kernel size) combination of args.
//
// NOTE(review): only part of this function is visible in this excerpt. The
// statement guarded by the first condition, the values assigned to
// `threshold` in each branch, and any final fallback branch are not shown;
// confirm them against the complete file.
71 bool prefer_premultiply(
const DepthwiseArgs &
args) {
// Guard for non-square configurations: row/column strides differ, or
// kernel rows/columns differ. (The guarded statement is not visible here.)
72 if ((
args.stride_rows !=
args.stride_cols) || (
args.kernel_rows !=
args.kernel_cols))
// Upper bound on channel_multiplier, chosen by the branches below.
77 unsigned int threshold;
// Stride 1, kernel size 3.
79 if (
args.stride_rows == 1 &&
args.kernel_rows == 3)
// Stride 1, kernel size 5.
83 else if (
args.stride_rows == 1 &&
args.kernel_rows == 5)
// Stride 2, kernel size 3.
87 else if (
args.stride_rows == 2 &&
args.kernel_rows == 3)
// Stride 2, kernel size 5.
91 else if (
args.stride_rows == 2 &&
args.kernel_rows == 5)
// The result is true when the multiplier does not exceed the threshold.
99 return args.channel_multiplier <= threshold;
// Cost estimate for the kernel strategy `Strategy`, given `args`.
//
// When args.channel_multiplier is greater than 1 and prefer_premultiply(args)
// is false, the function returns the maximum unsigned int value.
//
// NOTE(review): the expression that consumes the two operands below
// (input_channels * channel_multiplier, and the vector length obtained for
// Strategy::return_type with Strategy::vl_type) is not visible in this
// excerpt; confirm how they are combined against the complete file.
102 template <
class Strategy>
103 unsigned int cycle_estimate(
const DepthwiseArgs &
args,
const Nothing &)
// Channel multiplier present, but premultiplying is not preferred.
105 if (
args.channel_multiplier > 1 && !prefer_premultiply(
args))
107 return std::numeric_limits<unsigned int>::max();
// First operand: product of input channels and channel multiplier, cast to
// long unsigned.
114 (
long unsigned)
args.input_channels *
args.channel_multiplier,
// Second operand: vector length for the strategy's return type.
115 arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
// Cost estimate for a planar kernel strategy `Strategy` (per the function
// name), given `args`.
//
// NOTE(review): only two operands of the estimate are visible in this
// excerpt: input_channels * channel_multiplier, and the vector length for
// Strategy::return_type with Strategy::vl_type. The expression combining
// them is not shown; confirm against the complete file.
119 template <
class Strategy>
120 unsigned int planar_cycle_estimate(
const DepthwiseArgs &
args,
const Nothing &)
// First operand: product of input channels and channel multiplier, cast to
// long unsigned.
126 (
long unsigned)
args.input_channels *
args.channel_multiplier,
// Second operand: vector length for the strategy's return type.
127 arm_gemm::utils::get_vector_length<typename Strategy::return_type>(Strategy::vl_type)
// Cost estimate that is the inverse of prefer_premultiply(): returns the
// maximum unsigned int value when prefer_premultiply(args) is true, and 0
// otherwise. The second (Nothing) argument is unused.
131 unsigned int multiplier_cycle_estimate(
const DepthwiseArgs &
args,
const Nothing &)
133 return prefer_premultiply(
args)? std::numeric_limits<unsigned int>::max() : 0;
// Cost estimate that always returns the maximum unsigned int value,
// regardless of its (unnamed) arguments.
//
// The separate declaration carries __attribute__((unused)), which suppresses
// the compiler's unused-function warning when nothing references it.
136 unsigned int not_preferred(
const DepthwiseArgs &,
const Nothing &) __attribute__ ((unused));
137 unsigned int not_preferred(
const DepthwiseArgs &,
const Nothing &)
139 return std::numeric_limits<unsigned int>::max();
141 #endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
142 #endif // defined(__aarch64__)
// Table of FP16 depthwise implementations.
//
// Each visible entry lists a DepthwiseMethod, a name string, a constraint(...)
// expression, a cycle-estimate function (where one is shown), and a lambda
// that constructs the implementation from DepthwiseArgs.
//
// NOTE(review): entry braces and some fields are not visible in this excerpt
// (for example the remaining arguments of most constraint(...) calls).
// NOTE(review): the lambdas allocate with raw `new` and pass the pointers to
// the constructed object; ownership presumably transfers — confirm.
// NOTE(review): entry order presumably matters to whatever consumes this
// table — confirm against the selection code.
145 static const DepthwiseImplementation<__fp16, __fp16> depthwise_fp16_methods[] = {
146 #if defined(__aarch64__)
147 #if defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
148 #if defined(ARM_COMPUTE_ENABLE_SVE)
149 #if defined(ARM_COMPUTE_ENABLE_SME2)
// --- SME2 entries: compiled only when ARM_COMPUTE_ENABLE_SME2 is defined
// (nested inside the ARM_COMPUTE_ENABLE_SVE guard). ---
151 DepthwiseMethod::DEPTHFIRST,
152 "sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
153 constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
155 cycle_estimate<sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
156 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
157 auto strat =
new sme2_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(
args.cpu_info);
158 return new DepthwiseDepthfirst<__fp16>(strat,
args);
162 DepthwiseMethod::DEPTHFIRST,
163 "sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
164 constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
166 cycle_estimate<sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
167 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
168 auto strat =
new sme2_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(
args.cpu_info);
169 return new DepthwiseDepthfirst<__fp16>(strat,
args);
173 DepthwiseMethod::DEPTHFIRST,
174 "sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
175 constraint(is_supported<sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
177 cycle_estimate<sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
178 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
179 auto strat =
new sme2_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(
args.cpu_info);
180 return new DepthwiseDepthfirst<__fp16>(strat,
args);
184 DepthwiseMethod::DEPTHFIRST,
185 "sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
186 constraint(is_supported<sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
188 cycle_estimate<sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
189 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
190 auto strat =
new sme2_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(
args.cpu_info);
191 return new DepthwiseDepthfirst<__fp16>(strat,
args);
195 DepthwiseMethod::DEPTHFIRST,
196 "sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
197 constraint(is_supported<sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
199 cycle_estimate<sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
200 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
201 auto strat =
new sme2_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(
args.cpu_info);
202 return new DepthwiseDepthfirst<__fp16>(strat,
args);
205 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
// --- SVE entries: compiled only when ARM_COMPUTE_ENABLE_SVE is defined. ---
207 DepthwiseMethod::DEPTHFIRST,
208 "sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
209 constraint(is_supported<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
211 cycle_estimate<sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
212 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
213 auto strat =
new sve_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(
args.cpu_info);
214 return new DepthwiseDepthfirst<__fp16>(strat,
args);
218 DepthwiseMethod::DEPTHFIRST,
219 "sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
220 constraint(is_supported<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
222 cycle_estimate<sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
223 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
224 auto strat =
new sve_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(
args.cpu_info);
225 return new DepthwiseDepthfirst<__fp16>(strat,
args);
229 DepthwiseMethod::DEPTHFIRST,
230 "sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
231 constraint(is_supported<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
233 cycle_estimate<sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
234 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
235 auto strat =
new sve_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(
args.cpu_info);
236 return new DepthwiseDepthfirst<__fp16>(strat,
args);
240 DepthwiseMethod::DEPTHFIRST,
241 "sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
242 constraint(is_supported<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
244 cycle_estimate<sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
245 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
246 auto strat =
new sve_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(
args.cpu_info);
247 return new DepthwiseDepthfirst<__fp16>(strat,
args);
251 DepthwiseMethod::DEPTHFIRST,
252 "sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
253 constraint(is_supported<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
255 cycle_estimate<sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
256 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
257 auto strat =
new sve_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(
args.cpu_info);
258 return new DepthwiseDepthfirst<__fp16>(strat,
args);
261 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
// --- a64 entries: compiled under the __aarch64__ and FP16-kernel guards
// only (no SVE/SME2 build flag required). ---
263 DepthwiseMethod::DEPTHFIRST,
264 "a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst",
265 constraint(is_supported<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
267 cycle_estimate<a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst>,
268 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
269 auto strat =
new a64_fp16_nhwc_3x3_s1_output4x4_mla_depthfirst(
args.cpu_info);
270 return new DepthwiseDepthfirst<__fp16>(strat,
args);
274 DepthwiseMethod::DEPTHFIRST,
275 "a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst",
276 constraint(is_supported<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
278 cycle_estimate<a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst>,
279 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
280 auto strat =
new a64_fp16_nhwc_3x3_s1_output3x3_mla_depthfirst(
args.cpu_info);
281 return new DepthwiseDepthfirst<__fp16>(strat,
args);
285 DepthwiseMethod::DEPTHFIRST,
286 "a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst",
287 constraint(is_supported<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
289 cycle_estimate<a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst>,
290 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
291 auto strat =
new a64_fp16_nhwc_3x3_s1_output2x2_mla_depthfirst(
args.cpu_info);
292 return new DepthwiseDepthfirst<__fp16>(strat,
args);
296 DepthwiseMethod::DEPTHFIRST,
297 "a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst",
298 constraint(is_supported<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
300 cycle_estimate<a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst>,
301 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
302 auto strat =
new a64_fp16_nhwc_3x3_s2_output2x2_mla_depthfirst(
args.cpu_info);
303 return new DepthwiseDepthfirst<__fp16>(strat,
args);
307 DepthwiseMethod::DEPTHFIRST,
308 "a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst",
309 constraint(is_supported<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
311 cycle_estimate<a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst>,
312 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
313 auto strat =
new a64_fp16_nhwc_5x5_s1_output2x2_mla_depthfirst(
args.cpu_info);
314 return new DepthwiseDepthfirst<__fp16>(strat,
args);
// --- Generic a64 entry: constrained only on cpu_has_fp16. The kernel class
// is named "output9" while the strategy is constructed with 3, 3 —
// presumably a 3x3 output tile (9 outputs); TODO confirm. The
// cycle-estimate field of this entry is not visible in this excerpt. ---
318 DepthwiseMethod::DEPTHFIRST,
319 "a64_fp16_nhwc_generic_output3x3_mla_depthfirst",
320 constraint(cpu_has_fp16),
322 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
323 auto kern =
new a64_fp16_nhwc_generic_output9_mla_depthfirst(
args.cpu_info);
324 auto strat =
new GenericDepthfirstStrategy<__fp16>(kern, 3, 3,
args);
325 return new DepthwiseDepthfirstGeneric<__fp16>(strat,
args);
// --- Generic channel-multiplier entry: constrained on cpu_has_fp16 and
// has_channel_multiplier; its cost comes from multiplier_cycle_estimate. ---
329 DepthwiseMethod::DEPTHFIRST,
330 "a64_fp16_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
331 constraint(cpu_has_fp16, has_channel_multiplier),
332 multiplier_cycle_estimate,
333 [] (
const DepthwiseArgs &
args,
const Nothing &) -> DepthwiseCommon<__fp16, __fp16, __fp16> * {
334 auto kern =
new a64_fp16_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(
args.cpu_info);
335 auto strat =
new GenericDepthfirstMultiplierStrategy<__fp16>(kern,
args);
336 return new DepthwiseDepthfirstMultiplier<__fp16, __fp16, __fp16, __fp16, true>(strat,
args);
339 #endif // defined(ENABLE_FP16_KERNELS) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
340 #endif // defined(__aarch64__)
// Final entry, present regardless of the guards above: DepthwiseMethod::DEFAULT
// with an empty name and null function pointers. Presumably the
// end-of-list sentinel — confirm against the code that walks this table.
341 { DepthwiseMethod::DEFAULT,
"",
nullptr,
nullptr,
nullptr },
347 return depthwise_fp16_methods;
// Explicit template instantiations for the __fp16 type: depthwise(...)
// returning UniqueDepthwiseCommon<__fp16>, and get_compatible_kernels<__fp16>
// returning a std::vector<KernelDescription>. Both take
// (const DepthwiseArgs &, const Nothing &).
// NOTE(review): the template definitions themselves are not visible in this
// excerpt; presumably they live in a shared header — confirm.
350 template UniqueDepthwiseCommon<__fp16>
depthwise(
const DepthwiseArgs &,
const Nothing &);
351 template std::vector<KernelDescription> get_compatible_kernels<__fp16>(
const DepthwiseArgs &,
const Nothing &);
356 #endif // defined(__ARM_FP16_ARGS)