30 #if defined(ARM_COMPUTE_ENABLE_SVE)
// Entry point of a uint8 kernel implemented with SVE inline assembly.
// NOTE(review): the name suggests a quantized NHWC depthwise 5x5, stride-1
// convolution producing a 2x2 output tile; that matches the 36 input pointers,
// 25 weight vectors and 4 output pointers used by the body, but confirm
// against the caller. Several parameter lines are missing from this excerpt.
//   n_channels - channel count; used as the upper bound of the lane predicates.
//   inptrs     - table of input row pointers (36 entries are read).
//   weights    - uint8 weight data, read one vector at a time by the assembly.
//   bias       - int32 bias values, loaded as the initial accumulator contents.
//   outptrs    - table of output pointers (four entries are loaded and stored to).
35 void sve_u8q_nhwc_5x5_s1_output2x2_mla_depthfirst_impl(
36 const unsigned int n_channels,
37 const uint8_t *
const *
const inptrs,
38 const uint8_t *
const weights,
39 const int32_t *
const bias,
43 uint8_t *
const *
const outptrs
// Members of the argument block (`Params`) that the assembly reads through
// offsetof() immediates. Other members referenced by the assembly (weights,
// bias, requant, requant_muls, requant_shifts) are declared on lines missing
// from this excerpt.
48 long unsigned int n_channels;
54 uint8_t *
const *
const outptrs;
// Local, re-ordered copy of the caller's 36 input pointers.
55 const uint8_t *inptrs[36];
// Parameter list, initializer list and body of the Params constructor (the
// `Params(` line itself is not visible in this excerpt).
// It stores the scalar/pointer arguments and fills `inptrs` with a permuted
// copy of `inptrs_raw`: entries 0-12 are re-ordered, entries 13-35 are copied
// in place. The assembly reads `inptrs` in ascending index order, so this
// permutation presumably matches the order in which the kernel consumes the
// input rows — TODO confirm against the kernel generator.
58 long unsigned int n_channels,
59 const uint8_t *
const *inptrs_raw,
60 const void *
const weights,
61 const int32_t *
const bias,
65 uint8_t *
const *outptrs
66 ) : n_channels(n_channels), weights(weights),
bias(
bias),
// Re-ordered head of the table (destination index != source index).
70 inptrs[0] = inptrs_raw[0];
71 inptrs[1] = inptrs_raw[1];
72 inptrs[2] = inptrs_raw[6];
73 inptrs[3] = inptrs_raw[7];
74 inptrs[4] = inptrs_raw[2];
75 inptrs[5] = inptrs_raw[8];
76 inptrs[6] = inptrs_raw[3];
77 inptrs[7] = inptrs_raw[4];
78 inptrs[8] = inptrs_raw[11];
79 inptrs[9] = inptrs_raw[12];
80 inptrs[10] = inptrs_raw[9];
81 inptrs[11] = inptrs_raw[10];
82 inptrs[12] = inptrs_raw[5];
// From here on the copy is one-to-one.
83 inptrs[13] = inptrs_raw[13];
84 inptrs[14] = inptrs_raw[14];
85 inptrs[15] = inptrs_raw[15];
86 inptrs[16] = inptrs_raw[16];
87 inptrs[17] = inptrs_raw[17];
88 inptrs[18] = inptrs_raw[18];
89 inptrs[19] = inptrs_raw[19];
90 inptrs[20] = inptrs_raw[20];
91 inptrs[21] = inptrs_raw[21];
92 inptrs[22] = inptrs_raw[22];
93 inptrs[23] = inptrs_raw[23];
94 inptrs[24] = inptrs_raw[24];
95 inptrs[25] = inptrs_raw[25];
96 inptrs[26] = inptrs_raw[26];
97 inptrs[27] = inptrs_raw[27];
98 inptrs[28] = inptrs_raw[28];
99 inptrs[29] = inptrs_raw[29];
100 inptrs[30] = inptrs_raw[30];
101 inptrs[31] = inptrs_raw[31];
102 inptrs[32] = inptrs_raw[32];
103 inptrs[33] = inptrs_raw[33];
104 inptrs[34] = inptrs_raw[34];
105 inptrs[35] = inptrs_raw[35];
// Build the argument block handed to the assembly by address. The remaining
// constructor arguments are on lines missing from this excerpt.
110 const Params params(n_channels, inptrs, weights,
bias, qp,
// Start of the inline-assembly kernel. Every Params / Requantize32 member is
// reached as %x[params] (or the requant pointer) plus an offsetof() immediate.
// NOTE(review): the set-up of x2, x16, x24 and of predicate p4 is on lines
// missing from this excerpt.
113 __asm__ __volatile__(
// x23 = requant pointer, x3 = n_channels, x22 = outptrs, x4 = weights.
116 "ldr x23, [%x[params], %[offsetof_Params_requant]]\n"
117 "ldr x3, [%x[params], %[offsetof_Params_n_channels]]\n"
119 "ldr x22, [%x[params], %[offsetof_Params_outptrs]]\n"
121 "ldr x4, [%x[params], %[offsetof_Params_weights]]\n"
// Broadcast the quantization constants: z30 = a_offset and z10 = b_offset
// (bytes); z15 = c_offset, z12 = minval, z13 = maxval (halfwords).
122 "add x21, x23, %[offsetof_Requantize32_a_offset]\n"
123 "add x20, x23, %[offsetof_Requantize32_b_offset]\n"
124 "ld1rb { z30.b }, p4/Z, [x21]\n"
125 "ld1rb { z10.b }, p4/Z, [x20]\n"
126 "add x21, x23, %[offsetof_Requantize32_c_offset]\n"
127 "add x20, x23, %[offsetof_Requantize32_minval]\n"
128 "ld1rh { z15.h }, p4/Z, [x21]\n"
129 "ld1rh { z12.h }, p4/Z, [x20]\n"
130 "add x20, x23, %[offsetof_Requantize32_maxval]\n"
131 "ld1rh { z13.h }, p4/Z, [x20]\n"
// x5..x8 = the four output pointers; p3/p2/p1 = lane predicates bounded by
// n_channels (x3).
132 "ldp x5, x6, [x22, #0x0]\n"
133 "whilelt p3.h, x2, x3\n"
134 "ldp x7, x8, [x22, #0x10]\n"
135 "whilelt p2.s, x2, x3\n"
136 "whilelt p1.s, x24, x3\n"
// x10 = bias pointer, x17 = base of the re-ordered input pointer table.
137 "ldr x10, [%x[params], %[offsetof_Params_bias]]\n"
138 "add x17, %x[params], %[offsetof_Params_inptrs]\n"
// Two vectors of int32 bias are de-interleaved into even lanes (z14) and odd
// lanes (z23), because the smlalb/smlalt pairs below accumulate the even
// ("bottom") and odd ("top") halfword lanes into separate 32-bit vectors.
139 "ld1w { z17.s }, p2/Z, [x10]\n"
140 "ld1w { z16.s }, p1/Z, [x10, #1, MUL VL]\n"
141 "uzp1 z14.s, z17.s, z16.s\n"
// First five weight vectors, bytes widened into halfword lanes.
142 "ld1b { z26.h }, p4/Z, [x4]\n"
143 "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
144 "uzp2 z23.s, z17.s, z16.s\n"
145 "addvl x10, x10, #2\n"
146 "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
147 "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
150 "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
// First ten input pointers (table entries 0-9).
151 "ldp x9, x28, [x17, #0x0]\n"
154 "ldp x27, x26, [x17, #0x10]\n"
155 "ldp x25, x24, [x17, #0x20]\n"
158 "ldp x23, x22, [x17, #0x30]\n"
159 "ldp x21, x20, [x17, #0x40]\n"
// Load the corresponding inputs at channel offset x2 and remove the zero
// points: weights minus b_offset (z10), inputs minus a_offset (z30).
161 ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
162 "ld1b { z22.h }, p3/Z, [x9, x2]\n"
163 "ld1b { z2.h }, p3/Z, [x28, x2]\n"
164 ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
165 ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
166 "ld1b { z11.h }, p3/Z, [x27, x2]\n"
167 "ld1b { z3.h }, p3/Z, [x26, x2]\n"
168 ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
169 ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
170 "ld1b { z29.h }, p3/Z, [x25, x2]\n"
171 "ld1b { z4.h }, p3/Z, [x24, x2]\n"
172 ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
173 ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
174 "ld1b { z31.h }, p3/Z, [x23, x2]\n"
175 "ld1b { z0.h }, p3/Z, [x22, x2]\n"
176 ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
177 ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
178 "ld1b { z19.h }, p3/Z, [x21, x2]\n"
179 "ld1b { z28.h }, p3/Z, [x20, x2]\n"
180 ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
181 ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
// x15 = requant_muls, x14 = requant_shifts; write the advanced bias pointer
// back so it can be reloaded after x10 is reused.
182 "ldr x15, [%x[params], %[offsetof_Params_requant_muls]]\n"
183 "ldr x14, [%x[params], %[offsetof_Params_requant_shifts]]\n"
184 "str x10, [%x[params], %[offsetof_Params_bias]]\n"
185 ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
186 ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
187 ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
188 ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
// Multiply-accumulate section. Four outputs are built as bottom/top pairs of
// 32-bit accumulators: z14/z23 (stored via x5), z6/z18 (x6), z9/z20 (x7) and
// z7/z1 (x8). The arithmetic is interleaved with loads of the remaining input
// pointers (table offsets 0x50..0x118) and weight vectors; each freshly loaded
// vector has its zero point removed with usublb (z30 for inputs, z10 for
// weights) before use.
// NOTE(review): the initialisation of z6, z9, z7, z18, z20 and z1 and the
// loop label are on lines missing from this excerpt.
190 ".inst 0x449a42ce // smlalb z14.s, p4/M, z22.h, z26.h\n"
191 ".inst 0x449a46d7 // smlalt z23.s, p4/M, z22.h, z26.h\n"
192 "ldr x20, [x17, #0x50]\n"
193 "ld1b { z27.h }, p3/Z, [x20, x2]\n"
194 ".inst 0x4488404e // smlalb z14.s, p4/M, z2.h, z8.h\n"
195 ".inst 0x449a4046 // smlalb z6.s, p4/M, z2.h, z26.h\n"
196 "ldr x20, [x17, #0x58]\n"
197 ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
198 ".inst 0x449a4169 // smlalb z9.s, p4/M, z11.h, z26.h\n"
199 ".inst 0x449a4067 // smlalb z7.s, p4/M, z3.h, z26.h\n"
200 "ld1b { z5.h }, p3/Z, [x20, x2]\n"
201 "ldr x20, [x17, #0x60]\n"
202 ".inst 0x44884457 // smlalt z23.s, p4/M, z2.h, z8.h\n"
203 ".inst 0x449043ae // smlalb z14.s, p4/M, z29.h, z16.h\n"
204 "ld1b { z25.h }, p4/Z, [x4, #5, MUL VL]\n"
205 ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
206 ".inst 0x449a4452 // smlalt z18.s, p4/M, z2.h, z26.h\n"
207 ".inst 0x449a4574 // smlalt z20.s, p4/M, z11.h, z26.h\n"
208 "ld1b { z22.h }, p3/Z, [x20, x2]\n"
209 ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
210 ".inst 0x449a4461 // smlalt z1.s, p4/M, z3.h, z26.h\n"
211 ".inst 0x448843a6 // smlalb z6.s, p4/M, z29.h, z8.h\n"
212 "ldr x20, [x17, #0x68]\n"
213 "ld1b { z2.h }, p4/Z, [x4, #6, MUL VL]\n"
214 ".inst 0x44884069 // smlalb z9.s, p4/M, z3.h, z8.h\n"
215 ".inst 0x44884087 // smlalb z7.s, p4/M, z4.h, z8.h\n"
216 ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
217 "ld1b { z26.h }, p3/Z, [x20, x2]\n"
218 ".inst 0x449047b7 // smlalt z23.s, p4/M, z29.h, z16.h\n"
219 ".inst 0x449543ee // smlalb z14.s, p4/M, z31.h, z21.h\n"
220 ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
221 "ldr x20, [x17, #0x70]\n"
222 ".inst 0x448847b2 // smlalt z18.s, p4/M, z29.h, z8.h\n"
223 ".inst 0x44884474 // smlalt z20.s, p4/M, z3.h, z8.h\n"
224 "ld1b { z29.h }, p4/Z, [x4, #7, MUL VL]\n"
225 ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
226 ".inst 0x44884481 // smlalt z1.s, p4/M, z4.h, z8.h\n"
227 ".inst 0x449043e6 // smlalb z6.s, p4/M, z31.h, z16.h\n"
// Weight vectors 0-7 consumed: step the weight pointer past them.
228 "inch x4, ALL, MUL #8\n"
229 "ld1b { z8.h }, p3/Z, [x20, x2]\n"
230 ".inst 0x44904089 // smlalb z9.s, p4/M, z4.h, z16.h\n"
231 ".inst 0x44904367 // smlalb z7.s, p4/M, z27.h, z16.h\n"
232 ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
233 "ldr x20, [x17, #0x78]\n"
234 ".inst 0x449547f7 // smlalt z23.s, p4/M, z31.h, z21.h\n"
235 ".inst 0x4491400e // smlalb z14.s, p4/M, z0.h, z17.h\n"
236 "ld1b { z24.h }, p4/Z, [x4]\n"
237 ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
238 ".inst 0x449047f2 // smlalt z18.s, p4/M, z31.h, z16.h\n"
239 ".inst 0x44904494 // smlalt z20.s, p4/M, z4.h, z16.h\n"
240 "ld1b { z31.h }, p3/Z, [x20, x2]\n"
241 ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
242 ".inst 0x44904761 // smlalt z1.s, p4/M, z27.h, z16.h\n"
243 ".inst 0x44954006 // smlalb z6.s, p4/M, z0.h, z21.h\n"
244 "ldr x22, [x17, #0x80]\n"
245 "ld1b { z16.h }, p4/Z, [x4, #1, MUL VL]\n"
246 ".inst 0x44954369 // smlalb z9.s, p4/M, z27.h, z21.h\n"
247 ".inst 0x449540a7 // smlalb z7.s, p4/M, z5.h, z21.h\n"
248 ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
249 "ldr x21, [x17, #0x88]\n"
250 ".inst 0x44914417 // smlalt z23.s, p4/M, z0.h, z17.h\n"
251 ".inst 0x4499416e // smlalb z14.s, p4/M, z11.h, z25.h\n"
252 ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
253 "ldr x20, [x17, #0x90]\n"
254 ".inst 0x44954412 // smlalt z18.s, p4/M, z0.h, z21.h\n"
255 ".inst 0x44954774 // smlalt z20.s, p4/M, z27.h, z21.h\n"
256 "ld1b { z0.h }, p3/Z, [x22, x2]\n"
257 ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
258 ".inst 0x449544a1 // smlalt z1.s, p4/M, z5.h, z21.h\n"
259 ".inst 0x449142c6 // smlalb z6.s, p4/M, z22.h, z17.h\n"
260 "ld1b { z21.h }, p4/Z, [x4, #2, MUL VL]\n"
261 ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
262 ".inst 0x449140a9 // smlalb z9.s, p4/M, z5.h, z17.h\n"
263 ".inst 0x44914267 // smlalb z7.s, p4/M, z19.h, z17.h\n"
264 "ldr x23, [x17, #0x98]\n"
265 "ldr x22, [x17, #0xa0]\n"
266 ".inst 0x44994577 // smlalt z23.s, p4/M, z11.h, z25.h\n"
267 ".inst 0x4482406e // smlalb z14.s, p4/M, z3.h, z2.h\n"
268 "ld1b { z11.h }, p3/Z, [x21, x2]\n"
269 ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
270 ".inst 0x449146d2 // smlalt z18.s, p4/M, z22.h, z17.h\n"
271 ".inst 0x449144b4 // smlalt z20.s, p4/M, z5.h, z17.h\n"
272 "ld1b { z22.h }, p4/Z, [x4, #3, MUL VL]\n"
273 ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
274 ".inst 0x44914661 // smlalt z1.s, p4/M, z19.h, z17.h\n"
275 ".inst 0x44994066 // smlalb z6.s, p4/M, z3.h, z25.h\n"
276 "ld1b { z17.h }, p3/Z, [x20, x2]\n"
277 ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
278 ".inst 0x44994389 // smlalb z9.s, p4/M, z28.h, z25.h\n"
279 ".inst 0x44994347 // smlalb z7.s, p4/M, z26.h, z25.h\n"
280 "ldr x20, [x17, #0xa8]\n"
281 "ldr x21, [x17, #0xb0]\n"
282 ".inst 0x44824477 // smlalt z23.s, p4/M, z3.h, z2.h\n"
283 ".inst 0x449d408e // smlalb z14.s, p4/M, z4.h, z29.h\n"
284 "ldr x13, [x17, #0xb8]\n"
285 "ldr x12, [x17, #0xc0]\n"
286 ".inst 0x44994472 // smlalt z18.s, p4/M, z3.h, z25.h\n"
287 ".inst 0x44994794 // smlalt z20.s, p4/M, z28.h, z25.h\n"
288 "ld1b { z3.h }, p3/Z, [x23, x2]\n"
289 ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
290 ".inst 0x44994741 // smlalt z1.s, p4/M, z26.h, z25.h\n"
291 ".inst 0x44824086 // smlalb z6.s, p4/M, z4.h, z2.h\n"
292 "ld1b { z25.h }, p4/Z, [x4, #4, MUL VL]\n"
293 ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
294 ".inst 0x44824349 // smlalb z9.s, p4/M, z26.h, z2.h\n"
295 ".inst 0x44824107 // smlalb z7.s, p4/M, z8.h, z2.h\n"
296 "ldr x11, [x17, #0xc8]\n"
297 "ldr x10, [x17, #0xd0]\n"
298 ".inst 0x449d4497 // smlalt z23.s, p4/M, z4.h, z29.h\n"
299 ".inst 0x4498436e // smlalb z14.s, p4/M, z27.h, z24.h\n"
300 "ldr x9, [x17, #0xd8]\n"
301 "ldr x28, [x17, #0xe0]\n"
302 ".inst 0x44824492 // smlalt z18.s, p4/M, z4.h, z2.h\n"
303 ".inst 0x44824754 // smlalt z20.s, p4/M, z26.h, z2.h\n"
304 "ld1b { z4.h }, p3/Z, [x22, x2]\n"
305 ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
306 ".inst 0x44824501 // smlalt z1.s, p4/M, z8.h, z2.h\n"
307 ".inst 0x449d4366 // smlalb z6.s, p4/M, z27.h, z29.h\n"
308 "ld1b { z2.h }, p4/Z, [x4, #5, MUL VL]\n"
309 ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
310 ".inst 0x449d4109 // smlalb z9.s, p4/M, z8.h, z29.h\n"
311 ".inst 0x449d43e7 // smlalb z7.s, p4/M, z31.h, z29.h\n"
312 "ldr x27, [x17, #0xe8]\n"
313 "ldr x26, [x17, #0xf0]\n"
314 ".inst 0x44984777 // smlalt z23.s, p4/M, z27.h, z24.h\n"
315 ".inst 0x449040ae // smlalb z14.s, p4/M, z5.h, z16.h\n"
316 "ldr x25, [x17, #0xf8]\n"
317 "ldr x24, [x17, #0x100]\n"
318 ".inst 0x449d4772 // smlalt z18.s, p4/M, z27.h, z29.h\n"
319 ".inst 0x449d4514 // smlalt z20.s, p4/M, z8.h, z29.h\n"
320 "ld1b { z27.h }, p3/Z, [x20, x2]\n"
321 ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
322 ".inst 0x449d47e1 // smlalt z1.s, p4/M, z31.h, z29.h\n"
323 ".inst 0x449840a6 // smlalb z6.s, p4/M, z5.h, z24.h\n"
324 "ld1b { z29.h }, p4/Z, [x4, #6, MUL VL]\n"
325 ".inst 0x454a1bbd // usublb z29.h, z29.b, z10.b\n"
326 ".inst 0x449843e9 // smlalb z9.s, p4/M, z31.h, z24.h\n"
327 ".inst 0x44984007 // smlalb z7.s, p4/M, z0.h, z24.h\n"
328 "ldr x23, [x17, #0x108]\n"
329 "ldr x22, [x17, #0x110]\n"
330 ".inst 0x449044b7 // smlalt z23.s, p4/M, z5.h, z16.h\n"
331 ".inst 0x4495438e // smlalb z14.s, p4/M, z28.h, z21.h\n"
332 "ldr x20, [x17, #0x118]\n"
// p0 = store predicate for this pass (output channel offset x16).
333 "whilelt p0.h, x16, x3\n"
334 ".inst 0x449844b2 // smlalt z18.s, p4/M, z5.h, z24.h\n"
335 ".inst 0x449847f4 // smlalt z20.s, p4/M, z31.h, z24.h\n"
336 "ld1b { z5.h }, p3/Z, [x21, x2]\n"
337 ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
338 ".inst 0x44984401 // smlalt z1.s, p4/M, z0.h, z24.h\n"
339 ".inst 0x44904266 // smlalb z6.s, p4/M, z19.h, z16.h\n"
340 "ld1b { z24.h }, p4/Z, [x4, #7, MUL VL]\n"
// Weight vectors 8-15 consumed.
341 "inch x4, ALL, MUL #8\n"
342 ".inst 0x44904009 // smlalb z9.s, p4/M, z0.h, z16.h\n"
343 ".inst 0x44904167 // smlalb z7.s, p4/M, z11.h, z16.h\n"
344 ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
// x21 = bias pointer written back earlier; used after the stores to fetch the
// next bias vectors.
345 "ldr x21, [%x[params], %[offsetof_Params_bias]]\n"
346 ".inst 0x44954797 // smlalt z23.s, p4/M, z28.h, z21.h\n"
347 ".inst 0x4496434e // smlalb z14.s, p4/M, z26.h, z22.h\n"
348 "ld1b { z28.h }, p3/Z, [x13, x2]\n"
349 ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
350 ".inst 0x44904672 // smlalt z18.s, p4/M, z19.h, z16.h\n"
351 ".inst 0x44904414 // smlalt z20.s, p4/M, z0.h, z16.h\n"
352 "ld1b { z19.h }, p4/Z, [x4]\n"
353 ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
354 ".inst 0x44904561 // smlalt z1.s, p4/M, z11.h, z16.h\n"
355 ".inst 0x44954346 // smlalb z6.s, p4/M, z26.h, z21.h\n"
356 "ld1b { z16.h }, p3/Z, [x12, x2]\n"
357 ".inst 0x455e1a10 // usublb z16.h, z16.b, z30.b\n"
358 ".inst 0x44954229 // smlalb z9.s, p4/M, z17.h, z21.h\n"
359 ".inst 0x44954067 // smlalb z7.s, p4/M, z3.h, z21.h\n"
360 ".inst 0x44964757 // smlalt z23.s, p4/M, z26.h, z22.h\n"
361 ".inst 0x4499410e // smlalb z14.s, p4/M, z8.h, z25.h\n"
362 ".inst 0x44954752 // smlalt z18.s, p4/M, z26.h, z21.h\n"
363 ".inst 0x44954634 // smlalt z20.s, p4/M, z17.h, z21.h\n"
364 "ld1b { z26.h }, p3/Z, [x11, x2]\n"
365 ".inst 0x455e1b5a // usublb z26.h, z26.b, z30.b\n"
366 ".inst 0x44954461 // smlalt z1.s, p4/M, z3.h, z21.h\n"
367 ".inst 0x44964106 // smlalb z6.s, p4/M, z8.h, z22.h\n"
368 "ld1b { z21.h }, p4/Z, [x4, #1, MUL VL]\n"
369 ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
370 ".inst 0x44964069 // smlalb z9.s, p4/M, z3.h, z22.h\n"
371 ".inst 0x44964087 // smlalb z7.s, p4/M, z4.h, z22.h\n"
372 ".inst 0x44994517 // smlalt z23.s, p4/M, z8.h, z25.h\n"
373 ".inst 0x448243ee // smlalb z14.s, p4/M, z31.h, z2.h\n"
374 ".inst 0x44964512 // smlalt z18.s, p4/M, z8.h, z22.h\n"
375 ".inst 0x44964474 // smlalt z20.s, p4/M, z3.h, z22.h\n"
376 "ld1b { z8.h }, p3/Z, [x10, x2]\n"
377 ".inst 0x455e1908 // usublb z8.h, z8.b, z30.b\n"
378 ".inst 0x44964481 // smlalt z1.s, p4/M, z4.h, z22.h\n"
379 ".inst 0x449943e6 // smlalb z6.s, p4/M, z31.h, z25.h\n"
380 "ld1b { z22.h }, p4/Z, [x4, #2, MUL VL]\n"
381 ".inst 0x454a1ad6 // usublb z22.h, z22.b, z10.b\n"
382 ".inst 0x44994089 // smlalb z9.s, p4/M, z4.h, z25.h\n"
383 ".inst 0x44994367 // smlalb z7.s, p4/M, z27.h, z25.h\n"
384 ".inst 0x448247f7 // smlalt z23.s, p4/M, z31.h, z2.h\n"
385 ".inst 0x449d400e // smlalb z14.s, p4/M, z0.h, z29.h\n"
386 ".inst 0x449947f2 // smlalt z18.s, p4/M, z31.h, z25.h\n"
387 ".inst 0x44994494 // smlalt z20.s, p4/M, z4.h, z25.h\n"
388 "ld1b { z31.h }, p3/Z, [x9, x2]\n"
389 ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
390 ".inst 0x44994761 // smlalt z1.s, p4/M, z27.h, z25.h\n"
391 ".inst 0x44824006 // smlalb z6.s, p4/M, z0.h, z2.h\n"
392 "ld1b { z25.h }, p4/Z, [x4, #3, MUL VL]\n"
393 ".inst 0x454a1b39 // usublb z25.h, z25.b, z10.b\n"
394 ".inst 0x44824369 // smlalb z9.s, p4/M, z27.h, z2.h\n"
395 ".inst 0x448240a7 // smlalb z7.s, p4/M, z5.h, z2.h\n"
396 ".inst 0x449d4417 // smlalt z23.s, p4/M, z0.h, z29.h\n"
397 ".inst 0x4498422e // smlalb z14.s, p4/M, z17.h, z24.h\n"
398 ".inst 0x44824412 // smlalt z18.s, p4/M, z0.h, z2.h\n"
399 ".inst 0x44824774 // smlalt z20.s, p4/M, z27.h, z2.h\n"
400 "ld1b { z0.h }, p3/Z, [x28, x2]\n"
401 ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
402 ".inst 0x448244a1 // smlalt z1.s, p4/M, z5.h, z2.h\n"
403 ".inst 0x449d4166 // smlalb z6.s, p4/M, z11.h, z29.h\n"
404 "ld1b { z2.h }, p4/Z, [x4, #4, MUL VL]\n"
405 ".inst 0x454a1842 // usublb z2.h, z2.b, z10.b\n"
406 ".inst 0x449d40a9 // smlalb z9.s, p4/M, z5.h, z29.h\n"
407 ".inst 0x449d4387 // smlalb z7.s, p4/M, z28.h, z29.h\n"
408 ".inst 0x44984637 // smlalt z23.s, p4/M, z17.h, z24.h\n"
409 ".inst 0x4493406e // smlalb z14.s, p4/M, z3.h, z19.h\n"
410 "ld1b { z17.h }, p3/Z, [x27, x2]\n"
411 ".inst 0x455e1a31 // usublb z17.h, z17.b, z30.b\n"
412 ".inst 0x449d4572 // smlalt z18.s, p4/M, z11.h, z29.h\n"
413 ".inst 0x449d44b4 // smlalt z20.s, p4/M, z5.h, z29.h\n"
414 "ld1b { z11.h }, p4/Z, [x4, #5, MUL VL]\n"
415 ".inst 0x454a196b // usublb z11.h, z11.b, z10.b\n"
416 ".inst 0x449d4781 // smlalt z1.s, p4/M, z28.h, z29.h\n"
417 ".inst 0x44984066 // smlalb z6.s, p4/M, z3.h, z24.h\n"
418 "ld1b { z29.h }, p3/Z, [x26, x2]\n"
419 ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
420 ".inst 0x44984209 // smlalb z9.s, p4/M, z16.h, z24.h\n"
421 ".inst 0x44984347 // smlalb z7.s, p4/M, z26.h, z24.h\n"
422 ".inst 0x44934477 // smlalt z23.s, p4/M, z3.h, z19.h\n"
423 ".inst 0x4495408e // smlalb z14.s, p4/M, z4.h, z21.h\n"
424 ".inst 0x44984472 // smlalt z18.s, p4/M, z3.h, z24.h\n"
425 ".inst 0x44984614 // smlalt z20.s, p4/M, z16.h, z24.h\n"
426 "ld1b { z3.h }, p3/Z, [x25, x2]\n"
427 ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
428 ".inst 0x44984741 // smlalt z1.s, p4/M, z26.h, z24.h\n"
429 ".inst 0x44934086 // smlalb z6.s, p4/M, z4.h, z19.h\n"
430 "ld1b { z24.h }, p4/Z, [x4, #6, MUL VL]\n"
431 ".inst 0x454a1b18 // usublb z24.h, z24.b, z10.b\n"
432 ".inst 0x44934349 // smlalb z9.s, p4/M, z26.h, z19.h\n"
433 ".inst 0x44934107 // smlalb z7.s, p4/M, z8.h, z19.h\n"
434 ".inst 0x44954497 // smlalt z23.s, p4/M, z4.h, z21.h\n"
435 ".inst 0x4496436e // smlalb z14.s, p4/M, z27.h, z22.h\n"
436 ".inst 0x44934492 // smlalt z18.s, p4/M, z4.h, z19.h\n"
437 ".inst 0x44934754 // smlalt z20.s, p4/M, z26.h, z19.h\n"
438 "ld1b { z4.h }, p3/Z, [x24, x2]\n"
439 ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
440 ".inst 0x44934501 // smlalt z1.s, p4/M, z8.h, z19.h\n"
441 ".inst 0x44954366 // smlalb z6.s, p4/M, z27.h, z21.h\n"
442 "ld1b { z19.h }, p4/Z, [x4, #7, MUL VL]\n"
// Weight vectors 16-23 consumed; the 25th and last vector is loaded from [x4]
// a few lines further down.
443 "inch x4, ALL, MUL #8\n"
444 ".inst 0x44954109 // smlalb z9.s, p4/M, z8.h, z21.h\n"
445 ".inst 0x449543e7 // smlalb z7.s, p4/M, z31.h, z21.h\n"
446 ".inst 0x454a1a73 // usublb z19.h, z19.b, z10.b\n"
447 ".inst 0x44964777 // smlalt z23.s, p4/M, z27.h, z22.h\n"
448 ".inst 0x449940ae // smlalb z14.s, p4/M, z5.h, z25.h\n"
449 ".inst 0x44954772 // smlalt z18.s, p4/M, z27.h, z21.h\n"
450 ".inst 0x44954514 // smlalt z20.s, p4/M, z8.h, z21.h\n"
451 "ld1b { z27.h }, p3/Z, [x23, x2]\n"
452 ".inst 0x455e1b7b // usublb z27.h, z27.b, z30.b\n"
453 ".inst 0x449547e1 // smlalt z1.s, p4/M, z31.h, z21.h\n"
454 ".inst 0x449640a6 // smlalb z6.s, p4/M, z5.h, z22.h\n"
455 "ld1b { z21.h }, p4/Z, [x4]\n"
456 ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
457 ".inst 0x449643e9 // smlalb z9.s, p4/M, z31.h, z22.h\n"
458 ".inst 0x44964007 // smlalb z7.s, p4/M, z0.h, z22.h\n"
460 ".inst 0x449944b7 // smlalt z23.s, p4/M, z5.h, z25.h\n"
461 ".inst 0x4482420e // smlalb z14.s, p4/M, z16.h, z2.h\n"
462 ".inst 0x449644b2 // smlalt z18.s, p4/M, z5.h, z22.h\n"
463 ".inst 0x449647f4 // smlalt z20.s, p4/M, z31.h, z22.h\n"
464 "ld1b { z5.h }, p3/Z, [x22, x2]\n"
465 ".inst 0x455e18a5 // usublb z5.h, z5.b, z30.b\n"
466 ".inst 0x44964401 // smlalt z1.s, p4/M, z0.h, z22.h\n"
467 ".inst 0x44994386 // smlalb z6.s, p4/M, z28.h, z25.h\n"
// Requantization multipliers (from x15 = requant_muls) are loaded as two
// int32 vectors and split into even lanes (z25) and odd lanes (z16).
468 "ld1w { z22.s }, p2/Z, [x15]\n"
469 ".inst 0x44994009 // smlalb z9.s, p4/M, z0.h, z25.h\n"
470 ".inst 0x44994227 // smlalb z7.s, p4/M, z17.h, z25.h\n"
471 ".inst 0x44824617 // smlalt z23.s, p4/M, z16.h, z2.h\n"
472 ".inst 0x448b434e // smlalb z14.s, p4/M, z26.h, z11.h\n"
473 "ld1w { z16.s }, p1/Z, [x15, #1, MUL VL]\n"
474 "addvl x15, x15, #2\n"
475 ".inst 0x44994792 // smlalt z18.s, p4/M, z28.h, z25.h\n"
476 ".inst 0x44994414 // smlalt z20.s, p4/M, z0.h, z25.h\n"
477 "ld1b { z28.h }, p3/Z, [x20, x2]\n"
478 ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
479 ".inst 0x44994621 // smlalt z1.s, p4/M, z17.h, z25.h\n"
480 ".inst 0x44824346 // smlalb z6.s, p4/M, z26.h, z2.h\n"
481 "uzp1 z25.s, z22.s, z16.s\n"
483 ".inst 0x448243a9 // smlalb z9.s, p4/M, z29.h, z2.h\n"
484 ".inst 0x44824067 // smlalb z7.s, p4/M, z3.h, z2.h\n"
485 "uzp2 z16.s, z22.s, z16.s\n"
// Requantization shifts (from x14 = requant_shifts), split the same way into
// z29 (even lanes) and z22 (odd lanes).
486 "ld1w { z22.s }, p2/Z, [x14]\n"
487 ".inst 0x448b4757 // smlalt z23.s, p4/M, z26.h, z11.h\n"
488 ".inst 0x4498410e // smlalb z14.s, p4/M, z8.h, z24.h\n"
491 ".inst 0x44824752 // smlalt z18.s, p4/M, z26.h, z2.h\n"
492 ".inst 0x448247b4 // smlalt z20.s, p4/M, z29.h, z2.h\n"
493 "ld1w { z26.s }, p1/Z, [x14, #1, MUL VL]\n"
494 "uzp1 z29.s, z22.s, z26.s\n"
495 ".inst 0x44824461 // smlalt z1.s, p4/M, z3.h, z2.h\n"
496 ".inst 0x448b4106 // smlalb z6.s, p4/M, z8.h, z11.h\n"
497 "uzp2 z22.s, z22.s, z26.s\n"
// Recompute the load predicates against n_channels.
// NOTE(review): the instructions that advance x2 and set x20 for this purpose
// are on lines missing from this excerpt.
498 "whilelt p2.s, x2, x3\n"
499 ".inst 0x448b4069 // smlalb z9.s, p4/M, z3.h, z11.h\n"
500 ".inst 0x448b4087 // smlalb z7.s, p4/M, z4.h, z11.h\n"
501 "whilelt p1.s, x20, x3\n"
502 "whilelt p3.h, x2, x3\n"
503 ".inst 0x44984517 // smlalt z23.s, p4/M, z8.h, z24.h\n"
504 ".inst 0x449343ee // smlalb z14.s, p4/M, z31.h, z19.h\n"
505 "addvl x14, x14, #2\n"
506 ".inst 0x448b4512 // smlalt z18.s, p4/M, z8.h, z11.h\n"
507 ".inst 0x448b4474 // smlalt z20.s, p4/M, z3.h, z11.h\n"
508 ".inst 0x448b4481 // smlalt z1.s, p4/M, z4.h, z11.h\n"
509 ".inst 0x449843e6 // smlalb z6.s, p4/M, z31.h, z24.h\n"
510 ".inst 0x44984089 // smlalb z9.s, p4/M, z4.h, z24.h\n"
511 ".inst 0x44984367 // smlalb z7.s, p4/M, z27.h, z24.h\n"
512 ".inst 0x449347f7 // smlalt z23.s, p4/M, z31.h, z19.h\n"
513 ".inst 0x4495400e // smlalb z14.s, p4/M, z0.h, z21.h\n"
// Requantize and store, interleaved with the last multiply-accumulates.
// For each 32-bit accumulator:
//   1. sqrdmulh by the per-lane multiplier (z25 for even lanes, z16 for odd);
//   2. and/asr #31/sqadd adds -1 to lanes where both the value and the shift
//      are negative, as a fix-up ahead of the rounding shift;
//   3. srshl by the per-lane shift (z29 for even lanes, z22 for odd).
// The bottom/top pairs are then narrowed with saturation into one halfword
// vector (sqxtnb/sqxtnt), c_offset (z15) is added, the result is clamped to
// [minval (z12), maxval (z13)] and stored as bytes through x5..x8 at offset
// x16 under predicate p0.
514 ".inst 0x04b975ce // sqrdmulh z14.s, z14.s, z25.s\n"
515 ".inst 0x449847f2 // smlalt z18.s, p4/M, z31.h, z24.h\n"
516 ".inst 0x44984494 // smlalt z20.s, p4/M, z4.h, z24.h\n"
517 "and z3.d, z14.d, z29.d\n"
518 ".inst 0x44984761 // smlalt z1.s, p4/M, z27.h, z24.h\n"
519 ".inst 0x44934006 // smlalb z6.s, p4/M, z0.h, z19.h\n"
520 "asr z3.s, z3.s, #0x1f\n"
521 ".inst 0x44934369 // smlalb z9.s, p4/M, z27.h, z19.h\n"
522 ".inst 0x449340a7 // smlalb z7.s, p4/M, z5.h, z19.h\n"
523 "sqadd z14.s, z14.s, z3.s\n"
524 ".inst 0x448293ae // srshl z14.s, p4/M, z14.s, z29.s\n"
525 ".inst 0x44954417 // smlalt z23.s, p4/M, z0.h, z21.h\n"
526 ".inst 0x44934412 // smlalt z18.s, p4/M, z0.h, z19.h\n"
527 ".inst 0x04b076f7 // sqrdmulh z23.s, z23.s, z16.s\n"
528 ".inst 0x44934774 // smlalt z20.s, p4/M, z27.h, z19.h\n"
529 ".inst 0x449344a1 // smlalt z1.s, p4/M, z5.h, z19.h\n"
530 "and z31.d, z23.d, z22.d\n"
531 ".inst 0x44954226 // smlalb z6.s, p4/M, z17.h, z21.h\n"
532 ".inst 0x449540a9 // smlalb z9.s, p4/M, z5.h, z21.h\n"
533 ".inst 0x04b974c6 // sqrdmulh z6.s, z6.s, z25.s\n"
534 ".inst 0x44954387 // smlalb z7.s, p4/M, z28.h, z21.h\n"
535 ".inst 0x44954632 // smlalt z18.s, p4/M, z17.h, z21.h\n"
536 ".inst 0x04b97529 // sqrdmulh z9.s, z9.s, z25.s\n"
537 ".inst 0x449544b4 // smlalt z20.s, p4/M, z5.h, z21.h\n"
538 ".inst 0x44954781 // smlalt z1.s, p4/M, z28.h, z21.h\n"
539 ".inst 0x04b974e7 // sqrdmulh z7.s, z7.s, z25.s\n"
540 "asr z31.s, z31.s, #0x1f\n"
541 "and z3.d, z6.d, z29.d\n"
542 ".inst 0x04b07652 // sqrdmulh z18.s, z18.s, z16.s\n"
543 "and z0.d, z9.d, z29.d\n"
544 ".inst 0x04b07694 // sqrdmulh z20.s, z20.s, z16.s\n"
545 "and z19.d, z7.d, z29.d\n"
546 ".inst 0x04b07421 // sqrdmulh z1.s, z1.s, z16.s\n"
547 "sqadd z23.s, z23.s, z31.s\n"
548 ".inst 0x448292d7 // srshl z23.s, p4/M, z23.s, z22.s\n"
549 "asr z3.s, z3.s, #0x1f\n"
550 "and z21.d, z18.d, z22.d\n"
551 "asr z0.s, z0.s, #0x1f\n"
552 "and z17.d, z20.d, z22.d\n"
553 "asr z19.s, z19.s, #0x1f\n"
554 "and z16.d, z1.d, z22.d\n"
555 "sqadd z6.s, z6.s, z3.s\n"
556 "asr z21.s, z21.s, #0x1f\n"
557 ".inst 0x448293a6 // srshl z6.s, p4/M, z6.s, z29.s\n"
558 "sqadd z9.s, z9.s, z0.s\n"
559 "asr z17.s, z17.s, #0x1f\n"
560 ".inst 0x448293a9 // srshl z9.s, p4/M, z9.s, z29.s\n"
561 "sqadd z7.s, z7.s, z19.s\n"
562 "asr z16.s, z16.s, #0x1f\n"
563 ".inst 0x448293a7 // srshl z7.s, p4/M, z7.s, z29.s\n"
564 "sqadd z18.s, z18.s, z21.s\n"
565 "sqadd z20.s, z20.s, z17.s\n"
566 ".inst 0x448292d2 // srshl z18.s, p4/M, z18.s, z22.s\n"
567 ".inst 0x448292d4 // srshl z20.s, p4/M, z20.s, z22.s\n"
568 "sqadd z1.s, z1.s, z16.s\n"
// Saturating narrow: even-lane results go to the bottom halfwords, odd-lane
// results to the top halfwords, restoring channel order.
569 ".inst 0x453041ce // sqxtnb z14.h, z14.s\n"
570 ".inst 0x448292c1 // srshl z1.s, p4/M, z1.s, z22.s\n"
571 ".inst 0x453040c6 // sqxtnb z6.h, z6.s\n"
572 ".inst 0x45304129 // sqxtnb z9.h, z9.s\n"
573 ".inst 0x453040e7 // sqxtnb z7.h, z7.s\n"
574 ".inst 0x453046ee // sqxtnt z14.h, z23.s\n"
575 ".inst 0x45304646 // sqxtnt z6.h, z18.s\n"
576 ".inst 0x45304689 // sqxtnt z9.h, z20.s\n"
577 ".inst 0x45304427 // sqxtnt z7.h, z1.s\n"
// Add the output offset, clamp, and store the four results.
578 "sqadd z14.h, z14.h, z15.h\n"
579 "smax z14.h, p4/M, z14.h, z12.h\n"
580 "smin z14.h, p4/M, z14.h, z13.h\n"
581 "sqadd z6.h, z6.h, z15.h\n"
582 "sqadd z9.h, z9.h, z15.h\n"
583 "smax z6.h, p4/M, z6.h, z12.h\n"
584 "smax z9.h, p4/M, z9.h, z12.h\n"
585 "sqadd z7.h, z7.h, z15.h\n"
586 "smax z7.h, p4/M, z7.h, z12.h\n"
587 "smin z6.h, p4/M, z6.h, z13.h\n"
588 "st1b { z14.h }, p0, [x5, x16]\n"
589 "smin z9.h, p4/M, z9.h, z13.h\n"
590 "smin z7.h, p4/M, z7.h, z13.h\n"
591 "st1b { z6.h }, p0, [x6, x16]\n"
592 "st1b { z9.h }, p0, [x7, x16]\n"
593 "st1b { z7.h }, p0, [x8, x16]\n"
// Reload for the next pass: the same sequence as the prologue, but the bias
// pointer is in x21. It loads the next bias vectors into z14/z23, the first
// five weight vectors, the first ten input pointers and their inputs, then
// removes the zero points (z10 from weights, z30 from inputs).
// NOTE(review): the loop branch and closing label are on lines missing from
// this excerpt.
594 "ld1w { z17.s }, p2/Z, [x21]\n"
595 "ld1w { z16.s }, p1/Z, [x21, #1, MUL VL]\n"
596 "uzp1 z14.s, z17.s, z16.s\n"
597 "ld1b { z26.h }, p4/Z, [x4]\n"
598 "ld1b { z8.h }, p4/Z, [x4, #1, MUL VL]\n"
599 "uzp2 z23.s, z17.s, z16.s\n"
600 "addvl x21, x21, #2\n"
601 "ld1b { z16.h }, p4/Z, [x4, #2, MUL VL]\n"
602 "ld1b { z21.h }, p4/Z, [x4, #3, MUL VL]\n"
// Write the advanced bias pointer back before x21 is reused as an input pointer.
604 "str x21, [%x[params], %[offsetof_Params_bias]]\n"
605 "ld1b { z17.h }, p4/Z, [x4, #4, MUL VL]\n"
606 "ldp x9, x28, [x17, #0x0]\n"
609 "ldp x27, x26, [x17, #0x10]\n"
610 "ldp x25, x24, [x17, #0x20]\n"
613 "ldp x23, x22, [x17, #0x30]\n"
614 "ldp x21, x20, [x17, #0x40]\n"
617 "ld1b { z22.h }, p3/Z, [x9, x2]\n"
618 "ld1b { z2.h }, p3/Z, [x28, x2]\n"
619 ".inst 0x454a1b5a // usublb z26.h, z26.b, z10.b\n"
620 ".inst 0x454a1908 // usublb z8.h, z8.b, z10.b\n"
621 "ld1b { z11.h }, p3/Z, [x27, x2]\n"
622 "ld1b { z3.h }, p3/Z, [x26, x2]\n"
623 ".inst 0x454a1a10 // usublb z16.h, z16.b, z10.b\n"
624 ".inst 0x454a1ab5 // usublb z21.h, z21.b, z10.b\n"
625 "ld1b { z29.h }, p3/Z, [x25, x2]\n"
626 "ld1b { z4.h }, p3/Z, [x24, x2]\n"
627 ".inst 0x454a1a31 // usublb z17.h, z17.b, z10.b\n"
628 ".inst 0x455e1ad6 // usublb z22.h, z22.b, z30.b\n"
629 "ld1b { z31.h }, p3/Z, [x23, x2]\n"
630 "ld1b { z0.h }, p3/Z, [x22, x2]\n"
631 ".inst 0x455e1842 // usublb z2.h, z2.b, z30.b\n"
632 ".inst 0x455e196b // usublb z11.h, z11.b, z30.b\n"
633 "ld1b { z19.h }, p3/Z, [x21, x2]\n"
634 "ld1b { z28.h }, p3/Z, [x20, x2]\n"
635 ".inst 0x455e1863 // usublb z3.h, z3.b, z30.b\n"
636 ".inst 0x455e1bbd // usublb z29.h, z29.b, z30.b\n"
637 ".inst 0x455e1884 // usublb z4.h, z4.b, z30.b\n"
638 ".inst 0x455e1bff // usublb z31.h, z31.b, z30.b\n"
639 ".inst 0x455e1800 // usublb z0.h, z0.b, z30.b\n"
640 ".inst 0x455e1a73 // usublb z19.h, z19.b, z30.b\n"
641 ".inst 0x455e1b9c // usublb z28.h, z28.b, z30.b\n"
// Input operands of the asm statement.
// Each "I" operand is a compile-time byte offset of a Params or
// arm_gemm::Requantize32 member, referenced by name in the assembly.
// [params] is the ADDRESS of the local Params object in a general-purpose
// register: the assembly dereferences it as a base pointer
// (e.g. "ldr x3, [%x[params], %[offsetof_Params_n_channels]]").
// The operand previously read `¶ms`, a mis-decoded `&params`, which is not
// a valid expression.
644 : [offsetof_Params_bias]
"I" (offsetof(Params,
bias)), [offsetof_Params_inptrs]
"I" (offsetof(Params, inptrs)), [offsetof_Params_n_channels]
"I" (offsetof(Params, n_channels)), [offsetof_Params_outptrs]
"I" (offsetof(Params, outptrs)), [offsetof_Params_requant]
"I" (offsetof(Params, requant)), [offsetof_Params_requant_muls]
"I" (offsetof(Params,
requant_muls)), [offsetof_Params_requant_shifts]
"I" (offsetof(Params,
requant_shifts)), [offsetof_Params_weights]
"I" (offsetof(Params, weights)), [offsetof_Requantize32_a_offset]
"I" (offsetof(
arm_gemm::Requantize32, a_offset)), [offsetof_Requantize32_b_offset]
"I" (offsetof(
arm_gemm::Requantize32, b_offset)), [offsetof_Requantize32_c_offset]
"I" (offsetof(
arm_gemm::Requantize32, c_offset)), [offsetof_Requantize32_maxval]
"I" (offsetof(
arm_gemm::Requantize32, maxval)), [offsetof_Requantize32_minval]
"I" (offsetof(
arm_gemm::Requantize32, minval)), [params]
"r" (&params)
// Clobber list: tells the compiler everything the assembly overwrites —
// the condition flags, memory, predicate registers p0-p4, general-purpose
// registers x2-x17 and x20-x28, and all 32 SVE vector registers z0-z31.
645 :
"cc",
"memory",
"p0",
"p1",
"p2",
"p3",
"p4",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"x14",
"x15",
"x16",
"x17",
"x20",
"x21",
"x22",
"x23",
"x24",
"x25",
"x26",
"x27",
"x28",
"z0",
"z1",
"z2",
"z3",
"z4",
"z5",
"z6",
"z7",
"z8",
"z9",
"z10",
"z11",
"z12",
"z13",
"z14",
"z15",
"z16",
"z17",
"z18",
"z19",
"z20",
"z21",
"z22",
"z23",
"z24",
"z25",
"z26",
"z27",
"z28",
"z29",
"z30",
"z31"
652 #endif // defined(ARM_COMPUTE_ENABLE_SVE)