35 #if defined(__aarch64__)
36 #if defined(ARM_COMPUTE_ENABLE_SVE)
37 #if defined(ARM_COMPUTE_ENABLE_SME2)
42 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
50 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
65 #endif // defined(__aarch64__)
// not_preferred: only compiled when __aarch64__ is defined.
// Takes a DepthwiseArgs and a Requantize32 by const reference; both parameters
// are unnamed and therefore unused. Always yields the largest value
// representable in uint64_t.
// NOTE(review): presumably intended as a cost-estimate callback that ranks an
// implementation last; no use of it is visible here — confirm against callers.
76 #if defined(__aarch64__)
77 uint64_t not_preferred(
const DepthwiseArgs &,
const Requantize32 &)
79 return std::numeric_limits<uint64_t>::max();
81 #endif // defined(__aarch64__)
// Table of DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32>
// entries. Each visible entry names a DepthwiseMethod, a kernel name string,
// (where shown) a constraint<Requantize32>(...) expression, and a factory
// lambda taking (const DepthwiseArgs &, const Requantize32 &) and returning a
// DepthwiseCommon<uint8_t, uint8_t, uint8_t> *.
// NOTE(review): every factory allocates its strategy/kernel with raw `new` and
// passes the pointer to the returned object's constructor; ownership presumably
// transfers there — confirm against the Depthwise* class definitions.
// NOTE(review): entries are presumably listed in selection-priority order —
// confirm against the code that walks this table.
84 static const DepthwiseImplementation<uint8_t, uint8_t, uint8_t, Requantize32> depthwise_u8q_methods[] = {
85 #if defined(__aarch64__)
86 #if defined(ARM_COMPUTE_ENABLE_SVE)
87 #if defined(ARM_COMPUTE_ENABLE_SME2)
// --- PLANAR entries (only when ARM_COMPUTE_ENABLE_SME2 is defined) ---
// All four share the same constraint terms: cpu_has_sme, cpu_has_sme2,
// is_supported<kernel>, has_no_channel_multiplier, qp_has_no_left_shift and
// no_prime_right_pad. Each factory wraps the strategy in DepthwisePlanar<uint8_t>.
89 DepthwiseMethod::PLANAR,
90 "sme2_u8q_planar_3x3_s1_4rows_dot_za",
91 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
92 is_supported<sme2_u8q_planar_3x3_s1_4rows_dot_za>,
93 has_no_channel_multiplier,
94 qp_has_no_left_shift, no_prime_right_pad),
96 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
97 auto strat =
new sme2_u8q_planar_3x3_s1_4rows_dot_za(
args.cpu_info);
98 return new DepthwisePlanar<uint8_t>(strat,
args, qp);
102 DepthwiseMethod::PLANAR,
103 "sme2_u8q_planar_3x3_s2_4rows_dot_za",
104 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
105 is_supported<sme2_u8q_planar_3x3_s2_4rows_dot_za>,
106 has_no_channel_multiplier,
107 qp_has_no_left_shift, no_prime_right_pad),
109 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
110 auto strat =
new sme2_u8q_planar_3x3_s2_4rows_dot_za(
args.cpu_info);
111 return new DepthwisePlanar<uint8_t>(strat,
args, qp);
115 DepthwiseMethod::PLANAR,
116 "sme2_u8q_planar_5x5_s1_4rows_dot_za",
117 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
118 is_supported<sme2_u8q_planar_5x5_s1_4rows_dot_za>,
119 has_no_channel_multiplier,
120 qp_has_no_left_shift, no_prime_right_pad),
122 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
123 auto strat =
new sme2_u8q_planar_5x5_s1_4rows_dot_za(
args.cpu_info);
124 return new DepthwisePlanar<uint8_t>(strat,
args, qp);
128 DepthwiseMethod::PLANAR,
129 "sme2_u8q_planar_5x5_s2_4rows_dot_za",
130 constraint<Requantize32>(cpu_has_sme, cpu_has_sme2,
131 is_supported<sme2_u8q_planar_5x5_s2_4rows_dot_za>,
132 has_no_channel_multiplier,
133 qp_has_no_left_shift, no_prime_right_pad),
135 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
136 auto strat =
new sme2_u8q_planar_5x5_s2_4rows_dot_za(
args.cpu_info);
137 return new DepthwisePlanar<uint8_t>(strat,
args, qp);
140 #endif // defined(ARM_COMPUTE_ENABLE_SME2)
// --- DEPTHFIRST entries inside the ARM_COMPUTE_ENABLE_SVE guard ---
// The visible constraint terms for these are is_supported<kernel> and
// qp_has_no_left_shift; the constraint call is not closed in the lines shown,
// so further terms may exist. Factories return DepthwiseDepthfirst<uint8_t>.
142 DepthwiseMethod::DEPTHFIRST,
143 "sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
144 constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
145 qp_has_no_left_shift,
148 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
149 auto strat =
new sve_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(
args.cpu_info);
150 return new DepthwiseDepthfirst<uint8_t>(strat,
args, qp);
154 DepthwiseMethod::DEPTHFIRST,
155 "sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
156 constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
157 qp_has_no_left_shift,
160 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
161 auto strat =
new sve_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(
args.cpu_info);
162 return new DepthwiseDepthfirst<uint8_t>(strat,
args, qp);
166 DepthwiseMethod::DEPTHFIRST,
167 "sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
168 constraint<Requantize32>(is_supported<sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
169 qp_has_no_left_shift,
172 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
173 auto strat =
new sve_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(
args.cpu_info);
174 return new DepthwiseDepthfirst<uint8_t>(strat,
args, qp);
178 DepthwiseMethod::DEPTHFIRST,
179 "sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
180 constraint<Requantize32>(is_supported<sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
181 qp_has_no_left_shift,
184 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
185 auto strat =
new sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(
args.cpu_info);
186 return new DepthwiseDepthfirst<uint8_t>(strat,
args, qp);
// The next two entries additionally list has_channel_multiplier and are built
// as DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>.
190 DepthwiseMethod::DEPTHFIRST,
191 "sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
192 constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
193 qp_has_no_left_shift,
194 has_channel_multiplier,
197 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
198 auto strat =
new sve_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(
args.cpu_info);
199 return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat,
args, qp);
203 DepthwiseMethod::DEPTHFIRST,
204 "sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
205 constraint<Requantize32>(is_supported<sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
206 qp_has_no_left_shift,
207 has_channel_multiplier,
210 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
211 auto strat =
new sve_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(
args.cpu_info);
212 return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat,
args, qp);
215 #endif // defined(ARM_COMPUTE_ENABLE_SVE)
// --- DEPTHFIRST entries guarded only by __aarch64__ (a64_* kernels) ---
// Visible constraint terms: is_supported<kernel> and qp_has_no_left_shift.
// Factories return DepthwiseDepthfirst<uint8_t> unless noted otherwise.
217 DepthwiseMethod::DEPTHFIRST,
218 "a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst",
219 constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst>,
221 qp_has_no_left_shift),
223 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
224 auto strat =
new a64_u8q_nhwc_3x3_s1_output2x2_dot_depthfirst(
args.cpu_info);
225 return new DepthwiseDepthfirst<uint8_t>(strat,
args, qp);
// The following three entries use the a64_u8qa_* strategy classes.
230 DepthwiseMethod::DEPTHFIRST,
231 "a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst",
232 constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst>,
234 qp_has_no_left_shift),
236 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
237 auto strat =
new a64_u8qa_nhwc_3x3_s1_output2x2_mla_depthfirst(
args.cpu_info);
238 return new DepthwiseDepthfirst<uint8_t>(strat,
args, qp);
242 DepthwiseMethod::DEPTHFIRST,
243 "a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst",
244 constraint<Requantize32>(is_supported<a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst>,
246 qp_has_no_left_shift),
248 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
249 auto strat =
new a64_u8qa_nhwc_3x3_s2_output2x2_mla_depthfirst(
args.cpu_info);
250 return new DepthwiseDepthfirst<uint8_t>(strat,
args, qp);
254 DepthwiseMethod::DEPTHFIRST,
255 "a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst",
256 constraint<Requantize32>(is_supported<a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst>,
258 qp_has_no_left_shift),
260 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
261 auto strat =
new a64_u8qa_nhwc_5x5_s1_output2x2_mla_depthfirst(
args.cpu_info);
262 return new DepthwiseDepthfirst<uint8_t>(strat,
args, qp);
// The following three entries use the a64_u8q_* (not u8qa) strategy classes;
// their constraints are exactly is_supported<kernel> and qp_has_no_left_shift.
267 DepthwiseMethod::DEPTHFIRST,
268 "a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst",
269 constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst>,
270 qp_has_no_left_shift),
272 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
273 auto strat =
new a64_u8q_nhwc_3x3_s1_output2x2_mla_depthfirst(
args.cpu_info);
274 return new DepthwiseDepthfirst<uint8_t>(strat,
args, qp);
278 DepthwiseMethod::DEPTHFIRST,
279 "a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst",
280 constraint<Requantize32>(is_supported<a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst>,
281 qp_has_no_left_shift),
283 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
284 auto strat =
new a64_u8q_nhwc_3x3_s2_output2x2_mla_depthfirst(
args.cpu_info);
285 return new DepthwiseDepthfirst<uint8_t>(strat,
args, qp);
289 DepthwiseMethod::DEPTHFIRST,
290 "a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst",
291 constraint<Requantize32>(is_supported<a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst>,
292 qp_has_no_left_shift),
294 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
295 auto strat =
new a64_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst(
args.cpu_info);
296 return new DepthwiseDepthfirst<uint8_t>(strat,
args, qp);
// Generic entry: no constraint expression is visible for it. The factory
// builds an a64_u8q_nhwc_generic_output9_mla_depthfirst kernel, wraps it in
// GenericDepthfirstStrategy<uint8_t>(kernel, 3, 3, args), and returns a
// DepthwiseDepthfirstGeneric<uint8_t>.
// NOTE(review): the two literal 3s presumably give the 3x3 output tile named in
// the entry string — confirm against GenericDepthfirstStrategy's constructor.
300 DepthwiseMethod::DEPTHFIRST,
301 "a64_u8q_nhwc_generic_output3x3_mla_depthfirst",
304 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
305 auto kernel =
new a64_u8q_nhwc_generic_output9_mla_depthfirst(
args.cpu_info);
306 auto strat =
new GenericDepthfirstStrategy<uint8_t>(kernel, 3, 3,
args);
307 return new DepthwiseDepthfirstGeneric<uint8_t>(strat,
args, qp);
// Channel-multiplier entries: constraints include has_channel_multiplier and
// factories return DepthwiseDepthfirstMultiplier<..., int32_t, false>.
311 DepthwiseMethod::DEPTHFIRST,
312 "a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst",
313 constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst>,
315 has_channel_multiplier,
316 qp_has_no_left_shift),
318 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
319 auto strat =
new a64_u8q_packed_to_nhwc_3x3_s2_with_multiplier_output2x4_dot_depthfirst(
args.cpu_info);
320 return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat,
args, qp);
324 DepthwiseMethod::DEPTHFIRST,
325 "a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst",
326 constraint<Requantize32>(is_supported<a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst>,
328 has_channel_multiplier,
329 qp_has_no_left_shift),
331 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
332 auto strat =
new a64_u8q_packed_to_nhwc_5x5_s1_with_multiplier_output4x2_dot_depthfirst(
args.cpu_info);
333 return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, false>(strat,
args, qp);
// Generic channel-multiplier entry: the only constraint is
// has_channel_multiplier. The kernel is wrapped in
// GenericDepthfirstMultiplierStrategy<uint8_t>, and the last template argument
// of DepthwiseDepthfirstMultiplier is true here (false in the entries above).
337 DepthwiseMethod::DEPTHFIRST,
338 "a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst",
339 constraint<Requantize32>(has_channel_multiplier),
341 [] (
const DepthwiseArgs &
args,
const Requantize32 &qp) -> DepthwiseCommon<uint8_t, uint8_t, uint8_t> * {
342 auto kern =
new a64_u8q_packed_to_nhwc_generic_with_multiplier_output2x8_mla_depthfirst(
args.cpu_info);
343 auto strat =
new GenericDepthfirstMultiplierStrategy<uint8_t>(kern,
args);
344 return new DepthwiseDepthfirstMultiplier<uint8_t, uint8_t, uint8_t, int32_t, true>(strat,
args, qp);
348 #endif // defined(__aarch64__)
// Final entry, present regardless of the guards above: DepthwiseMethod::DEFAULT,
// an empty name, and three nullptr members.
// NOTE(review): presumably the end-of-list sentinel for whoever iterates this
// table — confirm against the consumer.
349 { DepthwiseMethod::DEFAULT,
"",
nullptr,
nullptr,
nullptr },
// Returns the depthwise_u8q_methods table (array-to-pointer decay applies if
// the return type is a pointer).
// NOTE(review): the enclosing function's signature is not visible here.
355 return depthwise_u8q_methods;
// Explicit instantiation of the `depthwise` function template for the
// signature UniqueDepthwiseCommon<uint8_t, uint8_t, uint8_t>(
// const DepthwiseArgs &, const Requantize32 &); template arguments are not
// spelled out and are left to be deduced from this signature.
358 template UniqueDepthwiseCommon<uint8_t, uint8_t, uint8_t>
depthwise(
const DepthwiseArgs &,
const Requantize32 &);