31 #include "../../asmlib.hpp"
32 #include "../../utils.hpp"
36 void a64_smallK_hybrid_u8u32_dot_8x4(
const uint8_t *A,
int lda,
const uint8_t *B, uint32_t *C,
int ldc,
int M,
int N,
int K,
const uint32_t *,
Activation,
bool) {
37 const long loops_count =
iceildiv(
N, (
int)4) - 1;
38 const long ldab = lda *
sizeof(uint8_t);
39 const long ldcb = ldc *
sizeof(uint32_t);
40 const long odds_count =
K % 4;
43 for (
int y0=0; y0<
M; y0+=8) {
44 long loops = loops_count;
45 long oob_rows = std::max(8 - (
M-y0), 0);
46 long odds = odds_count;
47 const uint8_t *b_ptr0 =
B;
48 const uint8_t *a_ptr0 =
A + (y0 * lda);
50 uint32_t *c_ptr0 = C + (y0 * ldc);
69 "add a_ptr1, %[a_ptr0], %[lda]\n"
70 "add c_ptr1, %[c_ptr0], %[ldc]\n"
71 "add a_ptr2, a_ptr1, %[lda]\n"
72 "add c_ptr2, c_ptr1, %[ldc]\n"
73 "add a_ptr3, a_ptr2, %[lda]\n"
74 "add c_ptr3, c_ptr2, %[ldc]\n"
75 "add a_ptr4, a_ptr3, %[lda]\n"
76 "add c_ptr4, c_ptr3, %[ldc]\n"
77 "add a_ptr5, a_ptr4, %[lda]\n"
78 "add c_ptr5, c_ptr4, %[ldc]\n"
79 "add a_ptr6, a_ptr5, %[lda]\n"
80 "add c_ptr6, c_ptr5, %[ldc]\n"
81 "add a_ptr7, a_ptr6, %[lda]\n"
82 "add c_ptr7, c_ptr6, %[ldc]\n"
83 "cbz %[oob_rows], 1f\n"
84 "subs %[oob_rows], %[oob_rows], #0x1\n"
85 "add c_ptr7, %[c_ptr0], #0x0\n"
86 "add a_ptr7, %[a_ptr0], #0x0\n"
88 "subs %[oob_rows], %[oob_rows], #0x1\n"
89 "add c_ptr6, %[c_ptr0], #0x0\n"
90 "add a_ptr6, %[a_ptr0], #0x0\n"
92 "subs %[oob_rows], %[oob_rows], #0x1\n"
93 "add c_ptr5, %[c_ptr0], #0x0\n"
94 "add a_ptr5, %[a_ptr0], #0x0\n"
96 "subs %[oob_rows], %[oob_rows], #0x1\n"
97 "add c_ptr4, %[c_ptr0], #0x0\n"
98 "add a_ptr4, %[a_ptr0], #0x0\n"
100 "subs %[oob_rows], %[oob_rows], #0x1\n"
101 "add c_ptr3, %[c_ptr0], #0x0\n"
102 "add a_ptr3, %[a_ptr0], #0x0\n"
104 "subs %[oob_rows], %[oob_rows], #0x1\n"
105 "add c_ptr2, %[c_ptr0], #0x0\n"
106 "add a_ptr2, %[a_ptr0], #0x0\n"
108 "subs %[oob_rows], %[oob_rows], #0x1\n"
109 "add c_ptr1, %[c_ptr0], #0x0\n"
110 "add a_ptr1, %[a_ptr0], #0x0\n"
113 "ldr s0, [%[a_ptr0]]\n"
123 "subs %[odds], %[odds], #0x1\n"
125 "ldr b0, [%[a_ptr0]]\n"
135 "ldr h0, [%[a_ptr0]], #0x2\n"
136 "ldr h1, [a_ptr1], #0x2\n"
137 "ldr h2, [a_ptr2], #0x2\n"
138 "ldr h3, [a_ptr3], #0x2\n"
139 "ldr h4, [a_ptr4], #0x2\n"
140 "ldr h5, [a_ptr5], #0x2\n"
141 "ldr h6, [a_ptr6], #0x2\n"
142 "ldr h7, [a_ptr7], #0x2\n"
143 "subs %[odds], %[odds], #0x1\n"
147 "ld1 {v0.b}[2], [%[a_ptr0]]\n"
148 "ld1 {v1.b}[2], [a_ptr1]\n"
149 "ld1 {v2.b}[2], [a_ptr2]\n"
150 "ld1 {v3.b}[2], [a_ptr3]\n"
151 "ld1 {v4.b}[2], [a_ptr4]\n"
152 "ld1 {v5.b}[2], [a_ptr5]\n"
153 "ld1 {v6.b}[2], [a_ptr6]\n"
154 "ld1 {v7.b}[2], [a_ptr7]\n"
156 "ldr q16, [%[b_ptr0]]\n"
157 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
158 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
159 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
160 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
161 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
162 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
163 "add %[b_ptr0], %[b_ptr0], #0x10\n"
166 "subs %[loops], %[loops], #0x1\n"
174 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
175 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
176 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
177 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
178 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
179 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
180 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
181 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
184 "str q24, [%[c_ptr0]]\n"
185 "subs %[loops], %[loops], #0x1\n"
187 "ldr q16, [%[b_ptr0]]\n"
188 "add %[c_ptr0], %[c_ptr0], #0x10\n"
189 "str q25, [c_ptr1]\n"
190 "add c_ptr1, c_ptr1, #0x10\n"
192 "add %[b_ptr0], %[b_ptr0], #0x10\n"
193 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
194 "str q26, [c_ptr2]\n"
196 "add c_ptr2, c_ptr2, #0x10\n"
197 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
198 "str q27, [c_ptr3]\n"
200 "add c_ptr3, c_ptr3, #0x10\n"
201 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
202 "str q28, [c_ptr4]\n"
204 "add c_ptr4, c_ptr4, #0x10\n"
205 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
206 "str q29, [c_ptr5]\n"
208 "add c_ptr5, c_ptr5, #0x10\n"
209 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
210 "str q30, [c_ptr6]\n"
212 "add c_ptr6, c_ptr6, #0x10\n"
213 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
214 "str q31, [c_ptr7]\n"
216 "add c_ptr7, c_ptr7, #0x10\n"
217 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
218 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
219 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
220 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
221 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
222 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
223 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
224 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
225 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
226 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
229 "str q24, [%[c_ptr0]]\n"
230 "add %[c_ptr0], %[c_ptr0], #0x10\n"
232 "ldr q16, [%[b_ptr0]]\n"
233 "add %[b_ptr0], %[b_ptr0], #0x10\n"
234 "str q25, [c_ptr1]\n"
235 "add c_ptr1, c_ptr1, #0x10\n"
237 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
238 "str q26, [c_ptr2]\n"
240 "add c_ptr2, c_ptr2, #0x10\n"
241 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
242 "str q27, [c_ptr3]\n"
244 "add c_ptr3, c_ptr3, #0x10\n"
245 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
246 "str q28, [c_ptr4]\n"
248 "add c_ptr4, c_ptr4, #0x10\n"
249 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
250 "str q29, [c_ptr5]\n"
252 "add c_ptr5, c_ptr5, #0x10\n"
253 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
254 "str q30, [c_ptr6]\n"
256 "add c_ptr6, c_ptr6, #0x10\n"
257 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
258 "str q31, [c_ptr7]\n"
260 "add c_ptr7, c_ptr7, #0x10\n"
261 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
262 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
273 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
274 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
275 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
276 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
277 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
278 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
279 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
280 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
282 "str q24, [%[c_ptr0]]\n"
283 "add %[c_ptr0], %[c_ptr0], #0x10\n"
284 "str q25, [c_ptr1]\n"
285 "str q26, [c_ptr2]\n"
286 "str q27, [c_ptr3]\n"
287 "str q28, [c_ptr4]\n"
288 "str q29, [c_ptr5]\n"
289 "str q30, [c_ptr6]\n"
290 "str q31, [c_ptr7]\n"
305 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
306 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
307 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
326 "add a_ptr1, %[a_ptr0], %[lda]\n"
327 "add c_ptr1, %[c_ptr0], %[ldc]\n"
328 "add a_ptr2, a_ptr1, %[lda]\n"
329 "add c_ptr2, c_ptr1, %[ldc]\n"
330 "add a_ptr3, a_ptr2, %[lda]\n"
331 "add c_ptr3, c_ptr2, %[ldc]\n"
332 "add a_ptr4, a_ptr3, %[lda]\n"
333 "add c_ptr4, c_ptr3, %[ldc]\n"
334 "add a_ptr5, a_ptr4, %[lda]\n"
335 "add c_ptr5, c_ptr4, %[ldc]\n"
336 "add a_ptr6, a_ptr5, %[lda]\n"
337 "add c_ptr6, c_ptr5, %[ldc]\n"
338 "add a_ptr7, a_ptr6, %[lda]\n"
339 "add c_ptr7, c_ptr6, %[ldc]\n"
340 "cbz %[oob_rows], 1f\n"
341 "subs %[oob_rows], %[oob_rows], #0x1\n"
342 "add c_ptr7, %[c_ptr0], #0x0\n"
343 "add a_ptr7, %[a_ptr0], #0x0\n"
345 "subs %[oob_rows], %[oob_rows], #0x1\n"
346 "add c_ptr6, %[c_ptr0], #0x0\n"
347 "add a_ptr6, %[a_ptr0], #0x0\n"
349 "subs %[oob_rows], %[oob_rows], #0x1\n"
350 "add c_ptr5, %[c_ptr0], #0x0\n"
351 "add a_ptr5, %[a_ptr0], #0x0\n"
353 "subs %[oob_rows], %[oob_rows], #0x1\n"
354 "add c_ptr4, %[c_ptr0], #0x0\n"
355 "add a_ptr4, %[a_ptr0], #0x0\n"
357 "subs %[oob_rows], %[oob_rows], #0x1\n"
358 "add c_ptr3, %[c_ptr0], #0x0\n"
359 "add a_ptr3, %[a_ptr0], #0x0\n"
361 "subs %[oob_rows], %[oob_rows], #0x1\n"
362 "add c_ptr2, %[c_ptr0], #0x0\n"
363 "add a_ptr2, %[a_ptr0], #0x0\n"
365 "subs %[oob_rows], %[oob_rows], #0x1\n"
366 "add c_ptr1, %[c_ptr0], #0x0\n"
367 "add a_ptr1, %[a_ptr0], #0x0\n"
370 "ldr d0, [%[a_ptr0]]\n"
380 "ldr s0, [%[a_ptr0]], #0x4\n"
381 "ldr s1, [a_ptr1], #0x4\n"
382 "ldr s2, [a_ptr2], #0x4\n"
383 "ldr s3, [a_ptr3], #0x4\n"
384 "ldr s4, [a_ptr4], #0x4\n"
385 "ldr s5, [a_ptr5], #0x4\n"
386 "ldr s6, [a_ptr6], #0x4\n"
387 "ldr s7, [a_ptr7], #0x4\n"
388 "subs %[odds], %[odds], #0x1\n"
390 "ld1 {v0.b}[4], [%[a_ptr0]]\n"
391 "ld1 {v1.b}[4], [a_ptr1]\n"
392 "ld1 {v2.b}[4], [a_ptr2]\n"
393 "ld1 {v3.b}[4], [a_ptr3]\n"
394 "ld1 {v4.b}[4], [a_ptr4]\n"
395 "ld1 {v5.b}[4], [a_ptr5]\n"
396 "ld1 {v6.b}[4], [a_ptr6]\n"
397 "ld1 {v7.b}[4], [a_ptr7]\n"
400 "ld1 {v0.h}[2], [%[a_ptr0]], #2\n"
401 "ld1 {v1.h}[2], [a_ptr1], #2\n"
402 "ld1 {v2.h}[2], [a_ptr2], #2\n"
403 "ld1 {v3.h}[2], [a_ptr3], #2\n"
404 "ld1 {v4.h}[2], [a_ptr4], #2\n"
405 "ld1 {v5.h}[2], [a_ptr5], #2\n"
406 "ld1 {v6.h}[2], [a_ptr6], #2\n"
407 "ld1 {v7.h}[2], [a_ptr7], #2\n"
408 "subs %[odds], %[odds], #0x1\n"
412 "ld1 {v0.b}[6], [%[a_ptr0]]\n"
413 "ld1 {v1.b}[6], [a_ptr1]\n"
414 "ld1 {v2.b}[6], [a_ptr2]\n"
415 "ld1 {v3.b}[6], [a_ptr3]\n"
416 "ld1 {v4.b}[6], [a_ptr4]\n"
417 "ld1 {v5.b}[6], [a_ptr5]\n"
418 "ld1 {v6.b}[6], [a_ptr6]\n"
419 "ld1 {v7.b}[6], [a_ptr7]\n"
421 "ldr q16, [%[b_ptr0]]\n"
422 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
423 "ldr q17, [%[b_ptr0], #0x10]\n"
424 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
425 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
426 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
427 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
428 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
429 "add %[b_ptr0], %[b_ptr0], #0x20\n"
432 "subs %[loops], %[loops], #0x1\n"
440 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
441 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
442 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
443 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
444 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
445 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
446 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
447 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
448 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
449 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
450 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
451 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
452 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
453 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
454 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
455 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
458 "str q24, [%[c_ptr0]]\n"
459 "subs %[loops], %[loops], #0x1\n"
461 "ldr q16, [%[b_ptr0]]\n"
462 "ldr q17, [%[b_ptr0], #0x10]\n"
463 "add %[c_ptr0], %[c_ptr0], #0x10\n"
464 "str q25, [c_ptr1]\n"
465 "add c_ptr1, c_ptr1, #0x10\n"
467 "add %[b_ptr0], %[b_ptr0], #0x20\n"
468 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
469 "str q26, [c_ptr2]\n"
471 "add c_ptr2, c_ptr2, #0x10\n"
472 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
473 "str q27, [c_ptr3]\n"
475 "add c_ptr3, c_ptr3, #0x10\n"
476 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
477 "str q28, [c_ptr4]\n"
479 "add c_ptr4, c_ptr4, #0x10\n"
480 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
481 "str q29, [c_ptr5]\n"
483 "add c_ptr5, c_ptr5, #0x10\n"
484 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
485 "str q30, [c_ptr6]\n"
487 "add c_ptr6, c_ptr6, #0x10\n"
488 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
489 "str q31, [c_ptr7]\n"
491 "add c_ptr7, c_ptr7, #0x10\n"
492 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
493 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
494 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
495 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
496 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
497 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
498 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
499 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
500 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
501 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
502 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
503 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
504 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
505 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
506 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
507 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
508 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
509 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
512 "str q24, [%[c_ptr0]]\n"
513 "add %[c_ptr0], %[c_ptr0], #0x10\n"
515 "ldr q16, [%[b_ptr0]]\n"
516 "ldr q17, [%[b_ptr0], #0x10]\n"
517 "add %[b_ptr0], %[b_ptr0], #0x20\n"
518 "str q25, [c_ptr1]\n"
519 "add c_ptr1, c_ptr1, #0x10\n"
521 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
522 "str q26, [c_ptr2]\n"
524 "add c_ptr2, c_ptr2, #0x10\n"
525 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
526 "str q27, [c_ptr3]\n"
528 "add c_ptr3, c_ptr3, #0x10\n"
529 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
530 "str q28, [c_ptr4]\n"
532 "add c_ptr4, c_ptr4, #0x10\n"
533 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
534 "str q29, [c_ptr5]\n"
536 "add c_ptr5, c_ptr5, #0x10\n"
537 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
538 "str q30, [c_ptr6]\n"
540 "add c_ptr6, c_ptr6, #0x10\n"
541 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
542 "str q31, [c_ptr7]\n"
544 "add c_ptr7, c_ptr7, #0x10\n"
545 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
546 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
547 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
548 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
549 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
550 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
551 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
552 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
553 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
554 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
565 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
566 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
567 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
568 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
569 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
570 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
571 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
572 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
573 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
574 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
575 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
576 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
577 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
578 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
579 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
580 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
582 "str q24, [%[c_ptr0]]\n"
583 "add %[c_ptr0], %[c_ptr0], #0x10\n"
584 "str q25, [c_ptr1]\n"
585 "str q26, [c_ptr2]\n"
586 "str q27, [c_ptr3]\n"
587 "str q28, [c_ptr4]\n"
588 "str q29, [c_ptr5]\n"
589 "str q30, [c_ptr6]\n"
590 "str q31, [c_ptr7]\n"
605 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
606 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
607 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
626 "add a_ptr1, %[a_ptr0], %[lda]\n"
627 "add c_ptr1, %[c_ptr0], %[ldc]\n"
628 "add a_ptr2, a_ptr1, %[lda]\n"
629 "add c_ptr2, c_ptr1, %[ldc]\n"
630 "add a_ptr3, a_ptr2, %[lda]\n"
631 "add c_ptr3, c_ptr2, %[ldc]\n"
632 "add a_ptr4, a_ptr3, %[lda]\n"
633 "add c_ptr4, c_ptr3, %[ldc]\n"
634 "add a_ptr5, a_ptr4, %[lda]\n"
635 "add c_ptr5, c_ptr4, %[ldc]\n"
636 "add a_ptr6, a_ptr5, %[lda]\n"
637 "add c_ptr6, c_ptr5, %[ldc]\n"
638 "add a_ptr7, a_ptr6, %[lda]\n"
639 "add c_ptr7, c_ptr6, %[ldc]\n"
640 "cbz %[oob_rows], 1f\n"
641 "subs %[oob_rows], %[oob_rows], #0x1\n"
642 "add c_ptr7, %[c_ptr0], #0x0\n"
643 "add a_ptr7, %[a_ptr0], #0x0\n"
645 "subs %[oob_rows], %[oob_rows], #0x1\n"
646 "add c_ptr6, %[c_ptr0], #0x0\n"
647 "add a_ptr6, %[a_ptr0], #0x0\n"
649 "subs %[oob_rows], %[oob_rows], #0x1\n"
650 "add c_ptr5, %[c_ptr0], #0x0\n"
651 "add a_ptr5, %[a_ptr0], #0x0\n"
653 "subs %[oob_rows], %[oob_rows], #0x1\n"
654 "add c_ptr4, %[c_ptr0], #0x0\n"
655 "add a_ptr4, %[a_ptr0], #0x0\n"
657 "subs %[oob_rows], %[oob_rows], #0x1\n"
658 "add c_ptr3, %[c_ptr0], #0x0\n"
659 "add a_ptr3, %[a_ptr0], #0x0\n"
661 "subs %[oob_rows], %[oob_rows], #0x1\n"
662 "add c_ptr2, %[c_ptr0], #0x0\n"
663 "add a_ptr2, %[a_ptr0], #0x0\n"
665 "subs %[oob_rows], %[oob_rows], #0x1\n"
666 "add c_ptr1, %[c_ptr0], #0x0\n"
667 "add a_ptr1, %[a_ptr0], #0x0\n"
669 "ldr d0, [%[a_ptr0]], #0x8\n"
670 "ldr d1, [a_ptr1], #0x8\n"
671 "ldr d2, [a_ptr2], #0x8\n"
672 "ldr d3, [a_ptr3], #0x8\n"
673 "ldr d4, [a_ptr4], #0x8\n"
674 "ldr d5, [a_ptr5], #0x8\n"
675 "ldr d6, [a_ptr6], #0x8\n"
676 "ldr d7, [a_ptr7], #0x8\n"
678 "ld1 {v0.s}[2], [%[a_ptr0]]\n"
679 "ld1 {v1.s}[2], [a_ptr1]\n"
680 "ld1 {v2.s}[2], [a_ptr2]\n"
681 "ld1 {v3.s}[2], [a_ptr3]\n"
682 "ld1 {v4.s}[2], [a_ptr4]\n"
683 "ld1 {v5.s}[2], [a_ptr5]\n"
684 "ld1 {v6.s}[2], [a_ptr6]\n"
685 "ld1 {v7.s}[2], [a_ptr7]\n"
688 "subs %[odds], %[odds], #0x1\n"
690 "ld1 {v0.b}[8], [%[a_ptr0]]\n"
691 "ld1 {v1.b}[8], [a_ptr1]\n"
692 "ld1 {v2.b}[8], [a_ptr2]\n"
693 "ld1 {v3.b}[8], [a_ptr3]\n"
694 "ld1 {v4.b}[8], [a_ptr4]\n"
695 "ld1 {v5.b}[8], [a_ptr5]\n"
696 "ld1 {v6.b}[8], [a_ptr6]\n"
697 "ld1 {v7.b}[8], [a_ptr7]\n"
700 "ld1 {v0.h}[4], [%[a_ptr0]], #2\n"
701 "ld1 {v1.h}[4], [a_ptr1], #2\n"
702 "ld1 {v2.h}[4], [a_ptr2], #2\n"
703 "ld1 {v3.h}[4], [a_ptr3], #2\n"
704 "ld1 {v4.h}[4], [a_ptr4], #2\n"
705 "ld1 {v5.h}[4], [a_ptr5], #2\n"
706 "ld1 {v6.h}[4], [a_ptr6], #2\n"
707 "ld1 {v7.h}[4], [a_ptr7], #2\n"
708 "subs %[odds], %[odds], #0x1\n"
712 "ld1 {v0.b}[10], [%[a_ptr0]]\n"
713 "ld1 {v1.b}[10], [a_ptr1]\n"
714 "ld1 {v2.b}[10], [a_ptr2]\n"
715 "ld1 {v3.b}[10], [a_ptr3]\n"
716 "ld1 {v4.b}[10], [a_ptr4]\n"
717 "ld1 {v5.b}[10], [a_ptr5]\n"
718 "ld1 {v6.b}[10], [a_ptr6]\n"
719 "ld1 {v7.b}[10], [a_ptr7]\n"
721 "ldr q16, [%[b_ptr0]]\n"
722 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
723 "ldr q17, [%[b_ptr0], #0x10]\n"
724 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
725 "ldr q18, [%[b_ptr0], #0x20]\n"
726 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
727 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
728 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
729 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
730 "add %[b_ptr0], %[b_ptr0], #0x30\n"
733 "subs %[loops], %[loops], #0x1\n"
741 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
742 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
743 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
744 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
745 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
746 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
747 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
748 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
749 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
750 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
751 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
752 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
753 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
754 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
755 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
756 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
757 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
758 ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
759 ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
760 ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
761 ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
762 ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
763 ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
764 ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
767 "str q24, [%[c_ptr0]]\n"
768 "subs %[loops], %[loops], #0x1\n"
770 "ldr q16, [%[b_ptr0]]\n"
771 "ldr q17, [%[b_ptr0], #0x10]\n"
772 "add %[c_ptr0], %[c_ptr0], #0x10\n"
773 "str q25, [c_ptr1]\n"
774 "add c_ptr1, c_ptr1, #0x10\n"
776 "ldr q18, [%[b_ptr0], #0x20]\n"
777 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
778 "str q26, [c_ptr2]\n"
780 "add c_ptr2, c_ptr2, #0x10\n"
781 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
782 "str q27, [c_ptr3]\n"
784 "add c_ptr3, c_ptr3, #0x10\n"
785 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
786 "str q28, [c_ptr4]\n"
788 "add c_ptr4, c_ptr4, #0x10\n"
789 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
790 "str q29, [c_ptr5]\n"
792 "add c_ptr5, c_ptr5, #0x10\n"
793 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
794 "str q30, [c_ptr6]\n"
796 "add c_ptr6, c_ptr6, #0x10\n"
797 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
798 "str q31, [c_ptr7]\n"
800 "add c_ptr7, c_ptr7, #0x10\n"
801 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
802 "add %[b_ptr0], %[b_ptr0], #0x30\n"
803 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
804 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
805 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
806 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
807 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
808 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
809 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
810 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
811 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
812 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
813 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
814 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
815 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
816 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
817 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
818 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
819 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
820 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
821 ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
822 ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
823 ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
824 ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
825 ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
826 ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
827 ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
830 "str q24, [%[c_ptr0]]\n"
831 "add %[c_ptr0], %[c_ptr0], #0x10\n"
833 "ldr q16, [%[b_ptr0]]\n"
834 "ldr q17, [%[b_ptr0], #0x10]\n"
835 "str q25, [c_ptr1]\n"
836 "add c_ptr1, c_ptr1, #0x10\n"
838 "ldr q18, [%[b_ptr0], #0x20]\n"
839 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
840 "str q26, [c_ptr2]\n"
842 "add c_ptr2, c_ptr2, #0x10\n"
843 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
844 "str q27, [c_ptr3]\n"
846 "add c_ptr3, c_ptr3, #0x10\n"
847 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
848 "str q28, [c_ptr4]\n"
850 "add c_ptr4, c_ptr4, #0x10\n"
851 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
852 "str q29, [c_ptr5]\n"
854 "add c_ptr5, c_ptr5, #0x10\n"
855 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
856 "str q30, [c_ptr6]\n"
858 "add c_ptr6, c_ptr6, #0x10\n"
859 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
860 "str q31, [c_ptr7]\n"
862 "add c_ptr7, c_ptr7, #0x10\n"
863 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
864 "add %[b_ptr0], %[b_ptr0], #0x30\n"
865 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
866 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
867 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
868 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
869 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
870 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
871 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
872 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
873 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
874 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
875 ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
876 ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
877 ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
878 ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
879 ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
880 ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
881 ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
892 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
893 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
894 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
895 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
896 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
897 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
898 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
899 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
900 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
901 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
902 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
903 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
904 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
905 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
906 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
907 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
908 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
909 ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
910 ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
911 ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
912 ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
913 ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
914 ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
915 ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
917 "str q24, [%[c_ptr0]]\n"
918 "add %[c_ptr0], %[c_ptr0], #0x10\n"
919 "str q25, [c_ptr1]\n"
920 "str q26, [c_ptr2]\n"
921 "str q27, [c_ptr3]\n"
922 "str q28, [c_ptr4]\n"
923 "str q29, [c_ptr5]\n"
924 "str q30, [c_ptr6]\n"
925 "str q31, [c_ptr7]\n"
940 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
941 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
942 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
961 "add a_ptr1, %[a_ptr0], %[lda]\n"
962 "add c_ptr1, %[c_ptr0], %[ldc]\n"
963 "add a_ptr2, a_ptr1, %[lda]\n"
964 "add c_ptr2, c_ptr1, %[ldc]\n"
965 "add a_ptr3, a_ptr2, %[lda]\n"
966 "add c_ptr3, c_ptr2, %[ldc]\n"
967 "add a_ptr4, a_ptr3, %[lda]\n"
968 "add c_ptr4, c_ptr3, %[ldc]\n"
969 "add a_ptr5, a_ptr4, %[lda]\n"
970 "add c_ptr5, c_ptr4, %[ldc]\n"
971 "add a_ptr6, a_ptr5, %[lda]\n"
972 "add c_ptr6, c_ptr5, %[ldc]\n"
973 "add a_ptr7, a_ptr6, %[lda]\n"
974 "add c_ptr7, c_ptr6, %[ldc]\n"
975 "cbz %[oob_rows], 1f\n"
976 "subs %[oob_rows], %[oob_rows], #0x1\n"
977 "add c_ptr7, %[c_ptr0], #0x0\n"
978 "add a_ptr7, %[a_ptr0], #0x0\n"
980 "subs %[oob_rows], %[oob_rows], #0x1\n"
981 "add c_ptr6, %[c_ptr0], #0x0\n"
982 "add a_ptr6, %[a_ptr0], #0x0\n"
984 "subs %[oob_rows], %[oob_rows], #0x1\n"
985 "add c_ptr5, %[c_ptr0], #0x0\n"
986 "add a_ptr5, %[a_ptr0], #0x0\n"
988 "subs %[oob_rows], %[oob_rows], #0x1\n"
989 "add c_ptr4, %[c_ptr0], #0x0\n"
990 "add a_ptr4, %[a_ptr0], #0x0\n"
992 "subs %[oob_rows], %[oob_rows], #0x1\n"
993 "add c_ptr3, %[c_ptr0], #0x0\n"
994 "add a_ptr3, %[a_ptr0], #0x0\n"
996 "subs %[oob_rows], %[oob_rows], #0x1\n"
997 "add c_ptr2, %[c_ptr0], #0x0\n"
998 "add a_ptr2, %[a_ptr0], #0x0\n"
1000 "subs %[oob_rows], %[oob_rows], #0x1\n"
1001 "add c_ptr1, %[c_ptr0], #0x0\n"
1002 "add a_ptr1, %[a_ptr0], #0x0\n"
1004 "cbnz %[odds], 2f\n"
1005 "ldr q0, [%[a_ptr0]]\n"
1006 "ldr q1, [a_ptr1]\n"
1007 "ldr q2, [a_ptr2]\n"
1008 "ldr q3, [a_ptr3]\n"
1009 "ldr q4, [a_ptr4]\n"
1010 "ldr q5, [a_ptr5]\n"
1011 "ldr q6, [a_ptr6]\n"
1012 "ldr q7, [a_ptr7]\n"
1015 "ldr d0, [%[a_ptr0]], #0x8\n"
1016 "ldr d1, [a_ptr1], #0x8\n"
1017 "ldr d2, [a_ptr2], #0x8\n"
1018 "ldr d3, [a_ptr3], #0x8\n"
1019 "ldr d4, [a_ptr4], #0x8\n"
1020 "ldr d5, [a_ptr5], #0x8\n"
1021 "ldr d6, [a_ptr6], #0x8\n"
1022 "ldr d7, [a_ptr7], #0x8\n"
1023 "ld1 {v0.s}[2], [%[a_ptr0]], #4\n"
1024 "ld1 {v1.s}[2], [a_ptr1], #4\n"
1025 "ld1 {v2.s}[2], [a_ptr2], #4\n"
1026 "ld1 {v3.s}[2], [a_ptr3], #4\n"
1027 "ld1 {v4.s}[2], [a_ptr4], #4\n"
1028 "ld1 {v5.s}[2], [a_ptr5], #4\n"
1029 "ld1 {v6.s}[2], [a_ptr6], #4\n"
1030 "ld1 {v7.s}[2], [a_ptr7], #4\n"
1031 "subs %[odds], %[odds], #0x1\n"
1033 "ld1 {v0.b}[12], [%[a_ptr0]]\n"
1034 "ld1 {v1.b}[12], [a_ptr1]\n"
1035 "ld1 {v2.b}[12], [a_ptr2]\n"
1036 "ld1 {v3.b}[12], [a_ptr3]\n"
1037 "ld1 {v4.b}[12], [a_ptr4]\n"
1038 "ld1 {v5.b}[12], [a_ptr5]\n"
1039 "ld1 {v6.b}[12], [a_ptr6]\n"
1040 "ld1 {v7.b}[12], [a_ptr7]\n"
1043 "ld1 {v0.h}[6], [%[a_ptr0]], #2\n"
1044 "ld1 {v1.h}[6], [a_ptr1], #2\n"
1045 "ld1 {v2.h}[6], [a_ptr2], #2\n"
1046 "ld1 {v3.h}[6], [a_ptr3], #2\n"
1047 "ld1 {v4.h}[6], [a_ptr4], #2\n"
1048 "ld1 {v5.h}[6], [a_ptr5], #2\n"
1049 "ld1 {v6.h}[6], [a_ptr6], #2\n"
1050 "ld1 {v7.h}[6], [a_ptr7], #2\n"
1051 "subs %[odds], %[odds], #0x1\n"
1055 "ld1 {v0.b}[14], [%[a_ptr0]]\n"
1056 "ld1 {v1.b}[14], [a_ptr1]\n"
1057 "ld1 {v2.b}[14], [a_ptr2]\n"
1058 "ld1 {v3.b}[14], [a_ptr3]\n"
1059 "ld1 {v4.b}[14], [a_ptr4]\n"
1060 "ld1 {v5.b}[14], [a_ptr5]\n"
1061 "ld1 {v6.b}[14], [a_ptr6]\n"
1062 "ld1 {v7.b}[14], [a_ptr7]\n"
1064 "ldr q16, [%[b_ptr0]]\n"
1065 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1066 "ldr q17, [%[b_ptr0], #0x10]\n"
1067 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1068 "ldr q18, [%[b_ptr0], #0x20]\n"
1069 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1070 "ldr q19, [%[b_ptr0], #0x30]\n"
1071 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1072 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1073 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1074 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1075 "cbz %[loops], 6f\n"
1077 "subs %[loops], %[loops], #0x1\n"
1085 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
1086 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
1087 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
1088 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
1089 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
1090 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
1091 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
1092 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
1093 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
1094 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
1095 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
1096 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
1097 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
1098 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
1099 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
1100 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
1101 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
1102 ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
1103 ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
1104 ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
1105 ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
1106 ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
1107 ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
1108 ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
1109 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
1110 ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
1111 ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
1112 ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
1113 ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
1114 ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
1115 ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
1116 ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
1119 "str q24, [%[c_ptr0]]\n"
1120 "subs %[loops], %[loops], #0x1\n"
1122 "ldr q16, [%[b_ptr0]]\n"
1123 "ldr q17, [%[b_ptr0], #0x10]\n"
1124 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1125 "str q25, [c_ptr1]\n"
1126 "add c_ptr1, c_ptr1, #0x10\n"
1128 "ldr q18, [%[b_ptr0], #0x20]\n"
1129 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
1130 "str q26, [c_ptr2]\n"
1132 "ldr q19, [%[b_ptr0], #0x30]\n"
1133 "add c_ptr2, c_ptr2, #0x10\n"
1134 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
1135 "str q27, [c_ptr3]\n"
1137 "add c_ptr3, c_ptr3, #0x10\n"
1138 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
1139 "str q28, [c_ptr4]\n"
1141 "add c_ptr4, c_ptr4, #0x10\n"
1142 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
1143 "str q29, [c_ptr5]\n"
1145 "add c_ptr5, c_ptr5, #0x10\n"
1146 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
1147 "str q30, [c_ptr6]\n"
1149 "add c_ptr6, c_ptr6, #0x10\n"
1150 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
1151 "str q31, [c_ptr7]\n"
1153 "add c_ptr7, c_ptr7, #0x10\n"
1154 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
1155 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1156 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
1157 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1158 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
1159 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1160 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
1161 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1162 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
1163 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1164 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
1165 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1166 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
1167 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1168 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
1169 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
1170 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
1171 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
1172 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
1173 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
1174 ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
1175 ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
1176 ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
1177 ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
1178 ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
1179 ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
1180 ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
1181 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
1182 ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
1183 ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
1184 ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
1185 ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
1186 ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
1187 ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
1188 ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
1191 "str q24, [%[c_ptr0]]\n"
1192 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1194 "ldr q16, [%[b_ptr0]]\n"
1195 "ldr q17, [%[b_ptr0], #0x10]\n"
1196 "str q25, [c_ptr1]\n"
1197 "add c_ptr1, c_ptr1, #0x10\n"
1199 "ldr q18, [%[b_ptr0], #0x20]\n"
1200 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
1201 "str q26, [c_ptr2]\n"
1203 "ldr q19, [%[b_ptr0], #0x30]\n"
1204 "add c_ptr2, c_ptr2, #0x10\n"
1205 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
1206 "str q27, [c_ptr3]\n"
1208 "add c_ptr3, c_ptr3, #0x10\n"
1209 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
1210 "str q28, [c_ptr4]\n"
1212 "add c_ptr4, c_ptr4, #0x10\n"
1213 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
1214 "str q29, [c_ptr5]\n"
1216 "add c_ptr5, c_ptr5, #0x10\n"
1217 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
1218 "str q30, [c_ptr6]\n"
1220 "add c_ptr6, c_ptr6, #0x10\n"
1221 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
1222 "str q31, [c_ptr7]\n"
1224 "add c_ptr7, c_ptr7, #0x10\n"
1225 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
1226 "add %[b_ptr0], %[b_ptr0], #0x40\n"
1227 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
1228 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
1229 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
1230 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
1231 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
1232 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
1233 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
1234 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
1235 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
1236 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
1237 ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
1238 ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
1239 ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
1240 ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
1241 ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
1242 ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
1243 ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
1244 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
1245 ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
1246 ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
1247 ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
1248 ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
1249 ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
1250 ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
1251 ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
1262 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
1263 ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
1264 ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
1265 ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
1266 ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
1267 ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
1268 ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
1269 ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
1270 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
1271 ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
1272 ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
1273 ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
1274 ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
1275 ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
1276 ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
1277 ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
1278 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
1279 ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
1280 ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
1281 ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
1282 ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
1283 ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
1284 ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
1285 ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
1286 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
1287 ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
1288 ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
1289 ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
1290 ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
1291 ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
1292 ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
1293 ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
1295 "str q24, [%[c_ptr0]]\n"
1296 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1297 "str q25, [c_ptr1]\n"
1298 "str q26, [c_ptr2]\n"
1299 "str q27, [c_ptr3]\n"
1300 "str q28, [c_ptr4]\n"
1301 "str q29, [c_ptr5]\n"
1302 "str q30, [c_ptr6]\n"
1303 "str q31, [c_ptr7]\n"
1318 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
1319 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
1320 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1339 "add a_ptr1, %[a_ptr0], %[lda]\n"
1340 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1341 "add a_ptr2, a_ptr1, %[lda]\n"
1342 "add c_ptr2, c_ptr1, %[ldc]\n"
1343 "add a_ptr3, a_ptr2, %[lda]\n"
1344 "add c_ptr3, c_ptr2, %[ldc]\n"
1345 "add a_ptr4, a_ptr3, %[lda]\n"
1346 "add c_ptr4, c_ptr3, %[ldc]\n"
1347 "add a_ptr5, a_ptr4, %[lda]\n"
1348 "add c_ptr5, c_ptr4, %[ldc]\n"
1349 "add a_ptr6, a_ptr5, %[lda]\n"
1350 "add c_ptr6, c_ptr5, %[ldc]\n"
1351 "add a_ptr7, a_ptr6, %[lda]\n"
1352 "add c_ptr7, c_ptr6, %[ldc]\n"
1353 "cbz %[oob_rows], 1f\n"
1354 "subs %[oob_rows], %[oob_rows], #0x1\n"
1355 "add c_ptr7, %[c_ptr0], #0x0\n"
1356 "add a_ptr7, %[a_ptr0], #0x0\n"
1358 "subs %[oob_rows], %[oob_rows], #0x1\n"
1359 "add c_ptr6, %[c_ptr0], #0x0\n"
1360 "add a_ptr6, %[a_ptr0], #0x0\n"
1362 "subs %[oob_rows], %[oob_rows], #0x1\n"
1363 "add c_ptr5, %[c_ptr0], #0x0\n"
1364 "add a_ptr5, %[a_ptr0], #0x0\n"
1366 "subs %[oob_rows], %[oob_rows], #0x1\n"
1367 "add c_ptr4, %[c_ptr0], #0x0\n"
1368 "add a_ptr4, %[a_ptr0], #0x0\n"
1370 "subs %[oob_rows], %[oob_rows], #0x1\n"
1371 "add c_ptr3, %[c_ptr0], #0x0\n"
1372 "add a_ptr3, %[a_ptr0], #0x0\n"
1374 "subs %[oob_rows], %[oob_rows], #0x1\n"
1375 "add c_ptr2, %[c_ptr0], #0x0\n"
1376 "add a_ptr2, %[a_ptr0], #0x0\n"
1378 "subs %[oob_rows], %[oob_rows], #0x1\n"
1379 "add c_ptr1, %[c_ptr0], #0x0\n"
1380 "add a_ptr1, %[a_ptr0], #0x0\n"
1382 "cbnz %[odds], 2f\n"
1383 "ldr q0, [%[a_ptr0]], #0x10\n"
1384 "ldr q2, [a_ptr1], #0x10\n"
1385 "ldr q4, [a_ptr2], #0x10\n"
1386 "ldr q6, [a_ptr3], #0x10\n"
1387 "ldr s1, [%[a_ptr0]]\n"
1388 "ldr q8, [a_ptr4], #0x10\n"
1389 "ldr s3, [a_ptr1]\n"
1390 "ldr q10, [a_ptr5], #0x10\n"
1391 "ldr s5, [a_ptr2]\n"
1392 "ldr q12, [a_ptr6], #0x10\n"
1393 "ldr s7, [a_ptr3]\n"
1394 "ldr q14, [a_ptr7], #0x10\n"
1395 "ldr s9, [a_ptr4]\n"
1396 "ldr s11, [a_ptr5]\n"
1397 "ldr s13, [a_ptr6]\n"
1398 "ldr s15, [a_ptr7]\n"
1401 "ldr q0, [%[a_ptr0]], #0x10\n"
1402 "subs %[odds], %[odds], #0x1\n"
1403 "ldr q2, [a_ptr1], #0x10\n"
1404 "ldr q4, [a_ptr2], #0x10\n"
1405 "ldr q6, [a_ptr3], #0x10\n"
1406 "ldr q8, [a_ptr4], #0x10\n"
1407 "ldr q10, [a_ptr5], #0x10\n"
1408 "ldr q12, [a_ptr6], #0x10\n"
1409 "ldr q14, [a_ptr7], #0x10\n"
1411 "ldr b1, [%[a_ptr0]]\n"
1412 "ldr b3, [a_ptr1]\n"
1413 "ldr b5, [a_ptr2]\n"
1414 "ldr b7, [a_ptr3]\n"
1415 "ldr b9, [a_ptr4]\n"
1416 "ldr b11, [a_ptr5]\n"
1417 "ldr b13, [a_ptr6]\n"
1418 "ldr b15, [a_ptr7]\n"
1421 "ldr h1, [%[a_ptr0]], #0x2\n"
1422 "ldr h3, [a_ptr1], #0x2\n"
1423 "ldr h5, [a_ptr2], #0x2\n"
1424 "ldr h7, [a_ptr3], #0x2\n"
1425 "ldr h9, [a_ptr4], #0x2\n"
1426 "ldr h11, [a_ptr5], #0x2\n"
1427 "ldr h13, [a_ptr6], #0x2\n"
1428 "ldr h15, [a_ptr7], #0x2\n"
1429 "subs %[odds], %[odds], #0x1\n"
1433 "ld1 {v1.b}[2], [%[a_ptr0]]\n"
1434 "ld1 {v3.b}[2], [a_ptr1]\n"
1435 "ld1 {v5.b}[2], [a_ptr2]\n"
1436 "ld1 {v7.b}[2], [a_ptr3]\n"
1437 "ld1 {v9.b}[2], [a_ptr4]\n"
1438 "ld1 {v11.b}[2], [a_ptr5]\n"
1439 "ld1 {v13.b}[2], [a_ptr6]\n"
1440 "ld1 {v15.b}[2], [a_ptr7]\n"
1442 "ldr q16, [%[b_ptr0]]\n"
1443 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1444 "ldr q17, [%[b_ptr0], #0x10]\n"
1445 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1446 "ldr q18, [%[b_ptr0], #0x20]\n"
1447 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1448 "ldr q19, [%[b_ptr0], #0x30]\n"
1449 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1450 "ldr q20, [%[b_ptr0], #0x40]\n"
1451 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1452 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1453 "add %[b_ptr0], %[b_ptr0], #0x50\n"
1454 "cbz %[loops], 6f\n"
1456 "subs %[loops], %[loops], #0x1\n"
1464 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
1465 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
1466 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
1467 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
1468 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
1469 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
1470 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
1471 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
1472 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
1473 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
1474 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
1475 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
1476 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
1477 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
1478 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
1479 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
1480 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
1481 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
1482 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
1483 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
1484 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
1485 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
1486 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
1487 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
1488 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
1489 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
1490 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
1491 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
1492 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
1493 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
1494 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
1495 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
1496 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
1497 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
1498 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
1499 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
1500 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
1501 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
1502 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
1503 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
1506 "str q24, [%[c_ptr0]]\n"
1507 "subs %[loops], %[loops], #0x1\n"
1509 "ldr q16, [%[b_ptr0]]\n"
1510 "ldr q17, [%[b_ptr0], #0x10]\n"
1511 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1512 "str q25, [c_ptr1]\n"
1513 "add c_ptr1, c_ptr1, #0x10\n"
1515 "ldr q18, [%[b_ptr0], #0x20]\n"
1516 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
1517 "str q26, [c_ptr2]\n"
1519 "ldr q19, [%[b_ptr0], #0x30]\n"
1520 "ldr q20, [%[b_ptr0], #0x40]\n"
1521 "add c_ptr2, c_ptr2, #0x10\n"
1522 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
1523 "str q27, [c_ptr3]\n"
1525 "add c_ptr3, c_ptr3, #0x10\n"
1526 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
1527 "str q28, [c_ptr4]\n"
1529 "add c_ptr4, c_ptr4, #0x10\n"
1530 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
1531 "str q29, [c_ptr5]\n"
1533 "add c_ptr5, c_ptr5, #0x10\n"
1534 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
1535 "str q30, [c_ptr6]\n"
1537 "add c_ptr6, c_ptr6, #0x10\n"
1538 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
1539 "str q31, [c_ptr7]\n"
1541 "add c_ptr7, c_ptr7, #0x10\n"
1542 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
1543 "add %[b_ptr0], %[b_ptr0], #0x50\n"
1544 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
1545 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1546 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
1547 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1548 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
1549 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1550 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
1551 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1552 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
1553 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1554 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
1555 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1556 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
1557 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
1558 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
1559 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
1560 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
1561 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
1562 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
1563 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
1564 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
1565 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
1566 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
1567 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
1568 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
1569 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
1570 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
1571 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
1572 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
1573 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
1574 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
1575 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
1576 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
1577 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
1578 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
1579 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
1580 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
1581 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
1582 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
1583 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
1584 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
1587 "str q24, [%[c_ptr0]]\n"
1588 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1590 "ldr q16, [%[b_ptr0]]\n"
1591 "ldr q17, [%[b_ptr0], #0x10]\n"
1592 "str q25, [c_ptr1]\n"
1593 "add c_ptr1, c_ptr1, #0x10\n"
1595 "ldr q18, [%[b_ptr0], #0x20]\n"
1596 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
1597 "str q26, [c_ptr2]\n"
1599 "ldr q19, [%[b_ptr0], #0x30]\n"
1600 "ldr q20, [%[b_ptr0], #0x40]\n"
1601 "add c_ptr2, c_ptr2, #0x10\n"
1602 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
1603 "str q27, [c_ptr3]\n"
1605 "add c_ptr3, c_ptr3, #0x10\n"
1606 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
1607 "str q28, [c_ptr4]\n"
1609 "add c_ptr4, c_ptr4, #0x10\n"
1610 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
1611 "str q29, [c_ptr5]\n"
1613 "add c_ptr5, c_ptr5, #0x10\n"
1614 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
1615 "str q30, [c_ptr6]\n"
1617 "add c_ptr6, c_ptr6, #0x10\n"
1618 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
1619 "str q31, [c_ptr7]\n"
1621 "add c_ptr7, c_ptr7, #0x10\n"
1622 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
1623 "add %[b_ptr0], %[b_ptr0], #0x50\n"
1624 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
1625 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
1626 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
1627 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
1628 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
1629 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
1630 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
1631 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
1632 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
1633 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
1634 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
1635 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
1636 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
1637 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
1638 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
1639 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
1640 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
1641 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
1642 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
1643 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
1644 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
1645 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
1646 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
1647 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
1648 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
1649 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
1650 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
1651 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
1652 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
1653 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
1654 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
1655 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
1656 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
1667 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
1668 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
1669 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
1670 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
1671 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
1672 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
1673 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
1674 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
1675 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
1676 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
1677 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
1678 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
1679 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
1680 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
1681 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
1682 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
1683 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
1684 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
1685 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
1686 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
1687 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
1688 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
1689 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
1690 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
1691 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
1692 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
1693 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
1694 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
1695 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
1696 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
1697 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
1698 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
1699 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
1700 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
1701 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
1702 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
1703 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
1704 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
1705 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
1706 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
1708 "str q24, [%[c_ptr0]]\n"
1709 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1710 "str q25, [c_ptr1]\n"
1711 "str q26, [c_ptr2]\n"
1712 "str q27, [c_ptr3]\n"
1713 "str q28, [c_ptr4]\n"
1714 "str q29, [c_ptr5]\n"
1715 "str q30, [c_ptr6]\n"
1716 "str q31, [c_ptr7]\n"
1731 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
1732 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
1733 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
1752 "add a_ptr1, %[a_ptr0], %[lda]\n"
1753 "add c_ptr1, %[c_ptr0], %[ldc]\n"
1754 "add a_ptr2, a_ptr1, %[lda]\n"
1755 "add c_ptr2, c_ptr1, %[ldc]\n"
1756 "add a_ptr3, a_ptr2, %[lda]\n"
1757 "add c_ptr3, c_ptr2, %[ldc]\n"
1758 "add a_ptr4, a_ptr3, %[lda]\n"
1759 "add c_ptr4, c_ptr3, %[ldc]\n"
1760 "add a_ptr5, a_ptr4, %[lda]\n"
1761 "add c_ptr5, c_ptr4, %[ldc]\n"
1762 "add a_ptr6, a_ptr5, %[lda]\n"
1763 "add c_ptr6, c_ptr5, %[ldc]\n"
1764 "add a_ptr7, a_ptr6, %[lda]\n"
1765 "add c_ptr7, c_ptr6, %[ldc]\n"
1766 "cbz %[oob_rows], 1f\n"
1767 "subs %[oob_rows], %[oob_rows], #0x1\n"
1768 "add c_ptr7, %[c_ptr0], #0x0\n"
1769 "add a_ptr7, %[a_ptr0], #0x0\n"
1771 "subs %[oob_rows], %[oob_rows], #0x1\n"
1772 "add c_ptr6, %[c_ptr0], #0x0\n"
1773 "add a_ptr6, %[a_ptr0], #0x0\n"
1775 "subs %[oob_rows], %[oob_rows], #0x1\n"
1776 "add c_ptr5, %[c_ptr0], #0x0\n"
1777 "add a_ptr5, %[a_ptr0], #0x0\n"
1779 "subs %[oob_rows], %[oob_rows], #0x1\n"
1780 "add c_ptr4, %[c_ptr0], #0x0\n"
1781 "add a_ptr4, %[a_ptr0], #0x0\n"
1783 "subs %[oob_rows], %[oob_rows], #0x1\n"
1784 "add c_ptr3, %[c_ptr0], #0x0\n"
1785 "add a_ptr3, %[a_ptr0], #0x0\n"
1787 "subs %[oob_rows], %[oob_rows], #0x1\n"
1788 "add c_ptr2, %[c_ptr0], #0x0\n"
1789 "add a_ptr2, %[a_ptr0], #0x0\n"
1791 "subs %[oob_rows], %[oob_rows], #0x1\n"
1792 "add c_ptr1, %[c_ptr0], #0x0\n"
1793 "add a_ptr1, %[a_ptr0], #0x0\n"
1795 "cbnz %[odds], 2f\n"
1796 "ldr q0, [%[a_ptr0]], #0x10\n"
1797 "ldr q2, [a_ptr1], #0x10\n"
1798 "ldr q4, [a_ptr2], #0x10\n"
1799 "ldr q6, [a_ptr3], #0x10\n"
1800 "ldr d1, [%[a_ptr0]]\n"
1801 "ldr q8, [a_ptr4], #0x10\n"
1802 "ldr d3, [a_ptr1]\n"
1803 "ldr q10, [a_ptr5], #0x10\n"
1804 "ldr d5, [a_ptr2]\n"
1805 "ldr q12, [a_ptr6], #0x10\n"
1806 "ldr d7, [a_ptr3]\n"
1807 "ldr q14, [a_ptr7], #0x10\n"
1808 "ldr d9, [a_ptr4]\n"
1809 "ldr d11, [a_ptr5]\n"
1810 "ldr d13, [a_ptr6]\n"
1811 "ldr d15, [a_ptr7]\n"
1814 "ldr q0, [%[a_ptr0]], #0x10\n"
1815 "subs %[odds], %[odds], #0x1\n"
1816 "ldr q2, [a_ptr1], #0x10\n"
1817 "ldr q4, [a_ptr2], #0x10\n"
1818 "ldr s1, [%[a_ptr0]], #0x4\n"
1819 "ldr q6, [a_ptr3], #0x10\n"
1820 "ldr s3, [a_ptr1], #0x4\n"
1821 "ldr q8, [a_ptr4], #0x10\n"
1822 "ldr s5, [a_ptr2], #0x4\n"
1823 "ldr q10, [a_ptr5], #0x10\n"
1824 "ldr s7, [a_ptr3], #0x4\n"
1825 "ldr q12, [a_ptr6], #0x10\n"
1826 "ldr s9, [a_ptr4], #0x4\n"
1827 "ldr q14, [a_ptr7], #0x10\n"
1828 "ldr s11, [a_ptr5], #0x4\n"
1829 "ldr s13, [a_ptr6], #0x4\n"
1830 "ldr s15, [a_ptr7], #0x4\n"
1832 "ld1 {v1.b}[4], [%[a_ptr0]]\n"
1833 "ld1 {v3.b}[4], [a_ptr1]\n"
1834 "ld1 {v5.b}[4], [a_ptr2]\n"
1835 "ld1 {v7.b}[4], [a_ptr3]\n"
1836 "ld1 {v9.b}[4], [a_ptr4]\n"
1837 "ld1 {v11.b}[4], [a_ptr5]\n"
1838 "ld1 {v13.b}[4], [a_ptr6]\n"
1839 "ld1 {v15.b}[4], [a_ptr7]\n"
1842 "ld1 {v1.h}[2], [%[a_ptr0]], #2\n"
1843 "ld1 {v3.h}[2], [a_ptr1], #2\n"
1844 "ld1 {v5.h}[2], [a_ptr2], #2\n"
1845 "ld1 {v7.h}[2], [a_ptr3], #2\n"
1846 "ld1 {v9.h}[2], [a_ptr4], #2\n"
1847 "ld1 {v11.h}[2], [a_ptr5], #2\n"
1848 "ld1 {v13.h}[2], [a_ptr6], #2\n"
1849 "ld1 {v15.h}[2], [a_ptr7], #2\n"
1850 "subs %[odds], %[odds], #0x1\n"
1854 "ld1 {v1.b}[6], [%[a_ptr0]]\n"
1855 "ld1 {v3.b}[6], [a_ptr1]\n"
1856 "ld1 {v5.b}[6], [a_ptr2]\n"
1857 "ld1 {v7.b}[6], [a_ptr3]\n"
1858 "ld1 {v9.b}[6], [a_ptr4]\n"
1859 "ld1 {v11.b}[6], [a_ptr5]\n"
1860 "ld1 {v13.b}[6], [a_ptr6]\n"
1861 "ld1 {v15.b}[6], [a_ptr7]\n"
1863 "ldr q16, [%[b_ptr0]]\n"
1864 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
1865 "ldr q17, [%[b_ptr0], #0x10]\n"
1866 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
1867 "ldr q18, [%[b_ptr0], #0x20]\n"
1868 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
1869 "ldr q19, [%[b_ptr0], #0x30]\n"
1870 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
1871 "ldr q20, [%[b_ptr0], #0x40]\n"
1872 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
1873 "ldr q21, [%[b_ptr0], #0x50]\n"
1874 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
1875 "add %[b_ptr0], %[b_ptr0], #0x60\n"
1876 "cbz %[loops], 6f\n"
1878 "subs %[loops], %[loops], #0x1\n"
1886 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
1887 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
1888 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
1889 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
1890 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
1891 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
1892 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
1893 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
1894 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
1895 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
1896 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
1897 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
1898 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
1899 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
1900 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
1901 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
1902 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
1903 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
1904 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
1905 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
1906 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
1907 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
1908 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
1909 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
1910 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
1911 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
1912 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
1913 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
1914 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
1915 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
1916 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
1917 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
1918 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
1919 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
1920 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
1921 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
1922 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
1923 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
1924 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
1925 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
1926 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
1927 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
1928 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
1929 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
1930 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
1931 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
1932 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
1933 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
1936 "str q24, [%[c_ptr0]]\n"
1937 "subs %[loops], %[loops], #0x1\n"
1939 "ldr q16, [%[b_ptr0]]\n"
1940 "ldr q17, [%[b_ptr0], #0x10]\n"
1941 "add %[c_ptr0], %[c_ptr0], #0x10\n"
1942 "str q25, [c_ptr1]\n"
1943 "add c_ptr1, c_ptr1, #0x10\n"
1945 "ldr q18, [%[b_ptr0], #0x20]\n"
1946 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
1947 "str q26, [c_ptr2]\n"
1949 "ldr q19, [%[b_ptr0], #0x30]\n"
1950 "ldr q20, [%[b_ptr0], #0x40]\n"
1951 "add c_ptr2, c_ptr2, #0x10\n"
1952 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
1953 "str q27, [c_ptr3]\n"
1955 "ldr q21, [%[b_ptr0], #0x50]\n"
1956 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
1957 "add c_ptr3, c_ptr3, #0x10\n"
1958 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
1959 "str q28, [c_ptr4]\n"
1961 "add c_ptr4, c_ptr4, #0x10\n"
1962 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
1963 "str q29, [c_ptr5]\n"
1965 "add c_ptr5, c_ptr5, #0x10\n"
1966 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
1967 "str q30, [c_ptr6]\n"
1969 "add c_ptr6, c_ptr6, #0x10\n"
1970 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
1971 "str q31, [c_ptr7]\n"
1973 "add c_ptr7, c_ptr7, #0x10\n"
1974 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
1975 "add %[b_ptr0], %[b_ptr0], #0x60\n"
1976 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
1977 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
1978 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
1979 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
1980 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
1981 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
1982 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
1983 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
1984 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
1985 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
1986 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
1987 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
1988 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
1989 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
1990 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
1991 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
1992 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
1993 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
1994 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
1995 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
1996 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
1997 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
1998 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
1999 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
2000 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
2001 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
2002 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
2003 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
2004 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
2005 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
2006 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
2007 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
2008 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
2009 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
2010 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
2011 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
2012 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
2013 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
2014 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
2015 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
2016 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
2017 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
2018 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
2019 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
2020 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
2021 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
2022 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
2023 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
2026 "str q24, [%[c_ptr0]]\n"
2027 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2029 "ldr q16, [%[b_ptr0]]\n"
2030 "ldr q17, [%[b_ptr0], #0x10]\n"
2031 "str q25, [c_ptr1]\n"
2032 "add c_ptr1, c_ptr1, #0x10\n"
2034 "ldr q18, [%[b_ptr0], #0x20]\n"
2035 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
2036 "str q26, [c_ptr2]\n"
2038 "ldr q19, [%[b_ptr0], #0x30]\n"
2039 "ldr q20, [%[b_ptr0], #0x40]\n"
2040 "add c_ptr2, c_ptr2, #0x10\n"
2041 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
2042 "str q27, [c_ptr3]\n"
2044 "ldr q21, [%[b_ptr0], #0x50]\n"
2045 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
2046 "add c_ptr3, c_ptr3, #0x10\n"
2047 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
2048 "str q28, [c_ptr4]\n"
2050 "add c_ptr4, c_ptr4, #0x10\n"
2051 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
2052 "str q29, [c_ptr5]\n"
2054 "add c_ptr5, c_ptr5, #0x10\n"
2055 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
2056 "str q30, [c_ptr6]\n"
2058 "add c_ptr6, c_ptr6, #0x10\n"
2059 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
2060 "str q31, [c_ptr7]\n"
2062 "add c_ptr7, c_ptr7, #0x10\n"
2063 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
2064 "add %[b_ptr0], %[b_ptr0], #0x60\n"
2065 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
2066 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
2067 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
2068 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
2069 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
2070 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
2071 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
2072 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
2073 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
2074 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
2075 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
2076 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
2077 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
2078 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
2079 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
2080 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
2081 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
2082 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
2083 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
2084 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
2085 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
2086 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
2087 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
2088 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
2089 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
2090 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
2091 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
2092 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
2093 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
2094 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
2095 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
2096 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
2097 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
2098 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
2099 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
2100 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
2101 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
2102 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
2103 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
2104 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
2115 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
2116 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
2117 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
2118 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
2119 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
2120 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
2121 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
2122 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
2123 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
2124 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
2125 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
2126 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
2127 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
2128 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
2129 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
2130 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
2131 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
2132 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
2133 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
2134 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
2135 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
2136 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
2137 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
2138 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
2139 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
2140 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
2141 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
2142 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
2143 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
2144 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
2145 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
2146 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
2147 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
2148 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
2149 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
2150 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
2151 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
2152 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
2153 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
2154 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
2155 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
2156 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
2157 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
2158 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
2159 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
2160 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
2161 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
2162 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
2164 "str q24, [%[c_ptr0]]\n"
2165 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2166 "str q25, [c_ptr1]\n"
2167 "str q26, [c_ptr2]\n"
2168 "str q27, [c_ptr3]\n"
2169 "str q28, [c_ptr4]\n"
2170 "str q29, [c_ptr5]\n"
2171 "str q30, [c_ptr6]\n"
2172 "str q31, [c_ptr7]\n"
2187 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
2188 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
2189 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2208 "add a_ptr1, %[a_ptr0], %[lda]\n"
2209 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2210 "add a_ptr2, a_ptr1, %[lda]\n"
2211 "add c_ptr2, c_ptr1, %[ldc]\n"
2212 "add a_ptr3, a_ptr2, %[lda]\n"
2213 "add c_ptr3, c_ptr2, %[ldc]\n"
2214 "add a_ptr4, a_ptr3, %[lda]\n"
2215 "add c_ptr4, c_ptr3, %[ldc]\n"
2216 "add a_ptr5, a_ptr4, %[lda]\n"
2217 "add c_ptr5, c_ptr4, %[ldc]\n"
2218 "add a_ptr6, a_ptr5, %[lda]\n"
2219 "add c_ptr6, c_ptr5, %[ldc]\n"
2220 "add a_ptr7, a_ptr6, %[lda]\n"
2221 "add c_ptr7, c_ptr6, %[ldc]\n"
2222 "cbz %[oob_rows], 1f\n"
2223 "subs %[oob_rows], %[oob_rows], #0x1\n"
2224 "add c_ptr7, %[c_ptr0], #0x0\n"
2225 "add a_ptr7, %[a_ptr0], #0x0\n"
2227 "subs %[oob_rows], %[oob_rows], #0x1\n"
2228 "add c_ptr6, %[c_ptr0], #0x0\n"
2229 "add a_ptr6, %[a_ptr0], #0x0\n"
2231 "subs %[oob_rows], %[oob_rows], #0x1\n"
2232 "add c_ptr5, %[c_ptr0], #0x0\n"
2233 "add a_ptr5, %[a_ptr0], #0x0\n"
2235 "subs %[oob_rows], %[oob_rows], #0x1\n"
2236 "add c_ptr4, %[c_ptr0], #0x0\n"
2237 "add a_ptr4, %[a_ptr0], #0x0\n"
2239 "subs %[oob_rows], %[oob_rows], #0x1\n"
2240 "add c_ptr3, %[c_ptr0], #0x0\n"
2241 "add a_ptr3, %[a_ptr0], #0x0\n"
2243 "subs %[oob_rows], %[oob_rows], #0x1\n"
2244 "add c_ptr2, %[c_ptr0], #0x0\n"
2245 "add a_ptr2, %[a_ptr0], #0x0\n"
2247 "subs %[oob_rows], %[oob_rows], #0x1\n"
2248 "add c_ptr1, %[c_ptr0], #0x0\n"
2249 "add a_ptr1, %[a_ptr0], #0x0\n"
2251 "ldr q0, [%[a_ptr0]], #0x10\n"
2252 "ldr q2, [a_ptr1], #0x10\n"
2253 "ldr q4, [a_ptr2], #0x10\n"
2254 "ldr q6, [a_ptr3], #0x10\n"
2255 "ldr d1, [%[a_ptr0]], #0x8\n"
2256 "ldr q8, [a_ptr4], #0x10\n"
2257 "ldr d3, [a_ptr1], #0x8\n"
2258 "ldr q10, [a_ptr5], #0x10\n"
2259 "ldr d5, [a_ptr2], #0x8\n"
2260 "ldr q12, [a_ptr6], #0x10\n"
2261 "ldr d7, [a_ptr3], #0x8\n"
2262 "ldr q14, [a_ptr7], #0x10\n"
2263 "ldr d9, [a_ptr4], #0x8\n"
2264 "ldr d11, [a_ptr5], #0x8\n"
2265 "ldr d13, [a_ptr6], #0x8\n"
2266 "ldr d15, [a_ptr7], #0x8\n"
2267 "cbnz %[odds], 2f\n"
2268 "ld1 {v1.s}[2], [%[a_ptr0]]\n"
2269 "ld1 {v3.s}[2], [a_ptr1]\n"
2270 "ld1 {v5.s}[2], [a_ptr2]\n"
2271 "ld1 {v7.s}[2], [a_ptr3]\n"
2272 "ld1 {v9.s}[2], [a_ptr4]\n"
2273 "ld1 {v11.s}[2], [a_ptr5]\n"
2274 "ld1 {v13.s}[2], [a_ptr6]\n"
2275 "ld1 {v15.s}[2], [a_ptr7]\n"
2278 "subs %[odds], %[odds], #0x1\n"
2280 "ld1 {v1.b}[8], [%[a_ptr0]]\n"
2281 "ld1 {v3.b}[8], [a_ptr1]\n"
2282 "ld1 {v5.b}[8], [a_ptr2]\n"
2283 "ld1 {v7.b}[8], [a_ptr3]\n"
2284 "ld1 {v9.b}[8], [a_ptr4]\n"
2285 "ld1 {v11.b}[8], [a_ptr5]\n"
2286 "ld1 {v13.b}[8], [a_ptr6]\n"
2287 "ld1 {v15.b}[8], [a_ptr7]\n"
2290 "ld1 {v1.h}[4], [%[a_ptr0]], #2\n"
2291 "ld1 {v3.h}[4], [a_ptr1], #2\n"
2292 "ld1 {v5.h}[4], [a_ptr2], #2\n"
2293 "ld1 {v7.h}[4], [a_ptr3], #2\n"
2294 "ld1 {v9.h}[4], [a_ptr4], #2\n"
2295 "ld1 {v11.h}[4], [a_ptr5], #2\n"
2296 "ld1 {v13.h}[4], [a_ptr6], #2\n"
2297 "ld1 {v15.h}[4], [a_ptr7], #2\n"
2298 "subs %[odds], %[odds], #0x1\n"
2302 "ld1 {v1.b}[10], [%[a_ptr0]]\n"
2303 "ld1 {v3.b}[10], [a_ptr1]\n"
2304 "ld1 {v5.b}[10], [a_ptr2]\n"
2305 "ld1 {v7.b}[10], [a_ptr3]\n"
2306 "ld1 {v9.b}[10], [a_ptr4]\n"
2307 "ld1 {v11.b}[10], [a_ptr5]\n"
2308 "ld1 {v13.b}[10], [a_ptr6]\n"
2309 "ld1 {v15.b}[10], [a_ptr7]\n"
2311 "ldr q16, [%[b_ptr0]]\n"
2312 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
2313 "ldr q17, [%[b_ptr0], #0x10]\n"
2314 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
2315 "ldr q18, [%[b_ptr0], #0x20]\n"
2316 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
2317 "ldr q19, [%[b_ptr0], #0x30]\n"
2318 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
2319 "ldr q20, [%[b_ptr0], #0x40]\n"
2320 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
2321 "ldr q21, [%[b_ptr0], #0x50]\n"
2322 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
2323 "ldr q22, [%[b_ptr0], #0x60]\n"
2324 "add %[b_ptr0], %[b_ptr0], #0x70\n"
2325 "cbz %[loops], 6f\n"
2327 "subs %[loops], %[loops], #0x1\n"
2335 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
2336 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
2337 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
2338 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
2339 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
2340 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
2341 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
2342 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
2343 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
2344 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
2345 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
2346 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
2347 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
2348 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
2349 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
2350 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
2351 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
2352 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
2353 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
2354 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
2355 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
2356 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
2357 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
2358 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
2359 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
2360 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
2361 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
2362 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
2363 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
2364 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
2365 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
2366 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
2367 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
2368 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
2369 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
2370 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
2371 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
2372 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
2373 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
2374 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
2375 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
2376 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
2377 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
2378 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
2379 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
2380 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
2381 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
2382 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
2383 ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
2384 ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
2385 ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
2386 ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
2387 ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
2388 ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
2389 ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
2390 ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
2393 "str q24, [%[c_ptr0]]\n"
2394 "subs %[loops], %[loops], #0x1\n"
2396 "ldr q16, [%[b_ptr0]]\n"
2397 "ldr q17, [%[b_ptr0], #0x10]\n"
2398 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2399 "str q25, [c_ptr1]\n"
2400 "add c_ptr1, c_ptr1, #0x10\n"
2402 "ldr q18, [%[b_ptr0], #0x20]\n"
2403 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
2404 "str q26, [c_ptr2]\n"
2406 "ldr q19, [%[b_ptr0], #0x30]\n"
2407 "ldr q20, [%[b_ptr0], #0x40]\n"
2408 "add c_ptr2, c_ptr2, #0x10\n"
2409 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
2410 "str q27, [c_ptr3]\n"
2412 "ldr q21, [%[b_ptr0], #0x50]\n"
2413 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
2414 "ldr q22, [%[b_ptr0], #0x60]\n"
2415 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
2416 "str q28, [c_ptr4]\n"
2418 "add c_ptr3, c_ptr3, #0x10\n"
2419 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
2420 "str q29, [c_ptr5]\n"
2422 "add c_ptr4, c_ptr4, #0x10\n"
2423 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
2424 "str q30, [c_ptr6]\n"
2426 "add c_ptr5, c_ptr5, #0x10\n"
2427 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
2428 "str q31, [c_ptr7]\n"
2430 "add c_ptr6, c_ptr6, #0x10\n"
2431 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
2432 "add c_ptr7, c_ptr7, #0x10\n"
2433 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
2434 "add %[b_ptr0], %[b_ptr0], #0x70\n"
2435 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
2436 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2437 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
2438 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2439 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
2440 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2441 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
2442 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2443 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
2444 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2445 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
2446 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2447 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
2448 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
2449 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
2450 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
2451 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
2452 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
2453 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
2454 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
2455 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
2456 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
2457 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
2458 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
2459 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
2460 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
2461 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
2462 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
2463 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
2464 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
2465 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
2466 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
2467 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
2468 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
2469 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
2470 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
2471 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
2472 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
2473 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
2474 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
2475 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
2476 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
2477 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
2478 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
2479 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
2480 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
2481 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
2482 ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
2483 ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
2484 ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
2485 ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
2486 ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
2487 ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
2488 ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
2489 ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
2492 "str q24, [%[c_ptr0]]\n"
2493 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2495 "ldr q16, [%[b_ptr0]]\n"
2496 "ldr q17, [%[b_ptr0], #0x10]\n"
2497 "str q25, [c_ptr1]\n"
2498 "add c_ptr1, c_ptr1, #0x10\n"
2500 "ldr q18, [%[b_ptr0], #0x20]\n"
2501 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
2502 "str q26, [c_ptr2]\n"
2504 "ldr q19, [%[b_ptr0], #0x30]\n"
2505 "ldr q20, [%[b_ptr0], #0x40]\n"
2506 "add c_ptr2, c_ptr2, #0x10\n"
2507 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
2508 "str q27, [c_ptr3]\n"
2510 "ldr q21, [%[b_ptr0], #0x50]\n"
2511 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
2512 "ldr q22, [%[b_ptr0], #0x60]\n"
2513 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
2514 "str q28, [c_ptr4]\n"
2516 "add c_ptr3, c_ptr3, #0x10\n"
2517 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
2518 "str q29, [c_ptr5]\n"
2520 "add c_ptr4, c_ptr4, #0x10\n"
2521 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
2522 "str q30, [c_ptr6]\n"
2524 "add c_ptr5, c_ptr5, #0x10\n"
2525 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
2526 "str q31, [c_ptr7]\n"
2528 "add c_ptr6, c_ptr6, #0x10\n"
2529 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
2530 "add c_ptr7, c_ptr7, #0x10\n"
2531 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
2532 "add %[b_ptr0], %[b_ptr0], #0x70\n"
2533 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
2534 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
2535 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
2536 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
2537 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
2538 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
2539 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
2540 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
2541 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
2542 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
2543 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
2544 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
2545 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
2546 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
2547 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
2548 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
2549 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
2550 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
2551 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
2552 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
2553 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
2554 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
2555 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
2556 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
2557 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
2558 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
2559 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
2560 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
2561 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
2562 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
2563 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
2564 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
2565 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
2566 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
2567 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
2568 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
2569 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
2570 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
2571 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
2572 ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
2573 ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
2574 ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
2575 ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
2576 ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
2577 ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
2578 ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
2579 ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
2590 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
2591 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
2592 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
2593 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
2594 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
2595 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
2596 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
2597 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
2598 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
2599 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
2600 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
2601 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
2602 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
2603 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
2604 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
2605 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
2606 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
2607 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
2608 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
2609 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
2610 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
2611 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
2612 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
2613 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
2614 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
2615 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
2616 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
2617 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
2618 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
2619 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
2620 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
2621 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
2622 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
2623 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
2624 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
2625 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
2626 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
2627 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
2628 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
2629 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
2630 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
2631 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
2632 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
2633 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
2634 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
2635 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
2636 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
2637 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
2638 ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
2639 ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
2640 ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
2641 ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
2642 ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
2643 ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
2644 ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
2645 ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
2647 "str q24, [%[c_ptr0]]\n"
2648 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2649 "str q25, [c_ptr1]\n"
2650 "str q26, [c_ptr2]\n"
2651 "str q27, [c_ptr3]\n"
2652 "str q28, [c_ptr4]\n"
2653 "str q29, [c_ptr5]\n"
2654 "str q30, [c_ptr6]\n"
2655 "str q31, [c_ptr7]\n"
2670 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
2671 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
2672 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
2692 "add a_ptr1, %[a_ptr0], %[lda]\n"
2693 "add c_ptr1, %[c_ptr0], %[ldc]\n"
2694 "add a_ptr2, a_ptr1, %[lda]\n"
2695 "add c_ptr2, c_ptr1, %[ldc]\n"
2696 "add a_ptr3, a_ptr2, %[lda]\n"
2697 "add c_ptr3, c_ptr2, %[ldc]\n"
2698 "add a_ptr4, a_ptr3, %[lda]\n"
2699 "add c_ptr4, c_ptr3, %[ldc]\n"
2700 "add a_ptr5, a_ptr4, %[lda]\n"
2701 "add c_ptr5, c_ptr4, %[ldc]\n"
2702 "add a_ptr6, a_ptr5, %[lda]\n"
2703 "add c_ptr6, c_ptr5, %[ldc]\n"
2704 "add a_ptr7, a_ptr6, %[lda]\n"
2705 "add c_ptr7, c_ptr6, %[ldc]\n"
2706 "cbz %[oob_rows], 1f\n"
2707 "subs %[oob_rows], %[oob_rows], #0x1\n"
2708 "add c_ptr7, %[c_ptr0], #0x0\n"
2709 "add a_ptr7, %[a_ptr0], #0x0\n"
2711 "subs %[oob_rows], %[oob_rows], #0x1\n"
2712 "add c_ptr6, %[c_ptr0], #0x0\n"
2713 "add a_ptr6, %[a_ptr0], #0x0\n"
2715 "subs %[oob_rows], %[oob_rows], #0x1\n"
2716 "add c_ptr5, %[c_ptr0], #0x0\n"
2717 "add a_ptr5, %[a_ptr0], #0x0\n"
2719 "subs %[oob_rows], %[oob_rows], #0x1\n"
2720 "add c_ptr4, %[c_ptr0], #0x0\n"
2721 "add a_ptr4, %[a_ptr0], #0x0\n"
2723 "subs %[oob_rows], %[oob_rows], #0x1\n"
2724 "add c_ptr3, %[c_ptr0], #0x0\n"
2725 "add a_ptr3, %[a_ptr0], #0x0\n"
2727 "subs %[oob_rows], %[oob_rows], #0x1\n"
2728 "add c_ptr2, %[c_ptr0], #0x0\n"
2729 "add a_ptr2, %[a_ptr0], #0x0\n"
2731 "subs %[oob_rows], %[oob_rows], #0x1\n"
2732 "add c_ptr1, %[c_ptr0], #0x0\n"
2733 "add a_ptr1, %[a_ptr0], #0x0\n"
2735 "cbnz %[odds], 2f\n"
2736 "ldr q0, [%[a_ptr0]], #0x10\n"
2737 "ldr q2, [a_ptr1], #0x10\n"
2738 "ldr q4, [a_ptr2], #0x10\n"
2739 "ldr q6, [a_ptr3], #0x10\n"
2740 "ldr q8, [a_ptr4], #0x10\n"
2741 "ldr q10, [a_ptr5], #0x10\n"
2742 "ldr q12, [a_ptr6], #0x10\n"
2743 "ldr q14, [a_ptr7], #0x10\n"
2744 "ldr q1, [%[a_ptr0]]\n"
2745 "ldr q3, [a_ptr1]\n"
2746 "ldr q5, [a_ptr2]\n"
2747 "ldr q7, [a_ptr3]\n"
2748 "ldr q9, [a_ptr4]\n"
2749 "ldr q11, [a_ptr5]\n"
2750 "ldr q13, [a_ptr6]\n"
2751 "ldr q15, [a_ptr7]\n"
2754 "ldr q0, [%[a_ptr0]], #0x10\n"
2755 "subs %[odds], %[odds], #0x1\n"
2756 "ldr q2, [a_ptr1], #0x10\n"
2757 "ldr q4, [a_ptr2], #0x10\n"
2758 "ldr d1, [%[a_ptr0]], #0x8\n"
2759 "ldr q6, [a_ptr3], #0x10\n"
2760 "ldr d3, [a_ptr1], #0x8\n"
2761 "ldr q8, [a_ptr4], #0x10\n"
2762 "ldr d5, [a_ptr2], #0x8\n"
2763 "ldr q10, [a_ptr5], #0x10\n"
2764 "ldr d7, [a_ptr3], #0x8\n"
2765 "ldr q12, [a_ptr6], #0x10\n"
2766 "ldr d9, [a_ptr4], #0x8\n"
2767 "ldr q14, [a_ptr7], #0x10\n"
2768 "ldr d11, [a_ptr5], #0x8\n"
2769 "ldr d13, [a_ptr6], #0x8\n"
2770 "ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
2771 "ldr d15, [a_ptr7], #0x8\n"
2772 "ld1 {v3.s}[2], [a_ptr1], #4\n"
2773 "ld1 {v5.s}[2], [a_ptr2], #4\n"
2774 "ld1 {v7.s}[2], [a_ptr3], #4\n"
2775 "ld1 {v9.s}[2], [a_ptr4], #4\n"
2776 "ld1 {v11.s}[2], [a_ptr5], #4\n"
2777 "ld1 {v13.s}[2], [a_ptr6], #4\n"
2778 "ld1 {v15.s}[2], [a_ptr7], #4\n"
2780 "ld1 {v1.b}[12], [%[a_ptr0]]\n"
2781 "ld1 {v3.b}[12], [a_ptr1]\n"
2782 "ld1 {v5.b}[12], [a_ptr2]\n"
2783 "ld1 {v7.b}[12], [a_ptr3]\n"
2784 "ld1 {v9.b}[12], [a_ptr4]\n"
2785 "ld1 {v11.b}[12], [a_ptr5]\n"
2786 "ld1 {v13.b}[12], [a_ptr6]\n"
2787 "ld1 {v15.b}[12], [a_ptr7]\n"
2790 "ld1 {v1.h}[6], [%[a_ptr0]], #2\n"
2791 "ld1 {v3.h}[6], [a_ptr1], #2\n"
2792 "ld1 {v5.h}[6], [a_ptr2], #2\n"
2793 "ld1 {v7.h}[6], [a_ptr3], #2\n"
2794 "ld1 {v9.h}[6], [a_ptr4], #2\n"
2795 "ld1 {v11.h}[6], [a_ptr5], #2\n"
2796 "ld1 {v13.h}[6], [a_ptr6], #2\n"
2797 "ld1 {v15.h}[6], [a_ptr7], #2\n"
2798 "subs %[odds], %[odds], #0x1\n"
2802 "ld1 {v1.b}[14], [%[a_ptr0]]\n"
2803 "ld1 {v3.b}[14], [a_ptr1]\n"
2804 "ld1 {v5.b}[14], [a_ptr2]\n"
2805 "ld1 {v7.b}[14], [a_ptr3]\n"
2806 "ld1 {v9.b}[14], [a_ptr4]\n"
2807 "ld1 {v11.b}[14], [a_ptr5]\n"
2808 "ld1 {v13.b}[14], [a_ptr6]\n"
2809 "ld1 {v15.b}[14], [a_ptr7]\n"
2811 "ldr q16, [%[b_ptr0]]\n"
2812 "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
2813 "ldr q17, [%[b_ptr0], #0x10]\n"
2814 "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
2815 "ldr q18, [%[b_ptr0], #0x20]\n"
2816 "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
2817 "ldr q19, [%[b_ptr0], #0x30]\n"
2818 "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
2819 "ldr q20, [%[b_ptr0], #0x40]\n"
2820 "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
2821 "ldr q21, [%[b_ptr0], #0x50]\n"
2822 "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
2823 "ldr q22, [%[b_ptr0], #0x60]\n"
2824 "ldr q23, [%[b_ptr0], #0x70]\n"
2825 "add %[b_ptr0], %[b_ptr0], #0x80\n"
2826 "cbz %[loops], 6f\n"
2828 "subs %[loops], %[loops], #0x1\n"
2836 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
2837 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
2838 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
2839 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
2840 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
2841 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
2842 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
2843 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
2844 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
2845 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
2846 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
2847 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
2848 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
2849 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
2850 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
2851 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
2852 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
2853 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
2854 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
2855 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
2856 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
2857 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
2858 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
2859 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
2860 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
2861 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
2862 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
2863 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
2864 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
2865 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
2866 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
2867 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
2868 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
2869 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
2870 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
2871 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
2872 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
2873 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
2874 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
2875 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
2876 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
2877 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
2878 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
2879 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
2880 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
2881 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
2882 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
2883 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
2884 ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
2885 ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
2886 ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
2887 ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
2888 ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
2889 ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
2890 ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
2891 ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
2892 ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
2893 ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
2894 ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
2895 ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
2896 ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
2897 ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
2898 ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
2899 ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
2902 "str q24, [%[c_ptr0]]\n"
2903 "subs %[loops], %[loops], #0x1\n"
2905 "ldr q16, [%[b_ptr0]]\n"
2906 "ldr q17, [%[b_ptr0], #0x10]\n"
2907 "add %[c_ptr0], %[c_ptr0], #0x10\n"
2908 "str q25, [c_ptr1]\n"
2909 "add c_ptr1, c_ptr1, #0x10\n"
2911 "ldr q18, [%[b_ptr0], #0x20]\n"
2912 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
2913 "str q26, [c_ptr2]\n"
2915 "ldr q19, [%[b_ptr0], #0x30]\n"
2916 "ldr q20, [%[b_ptr0], #0x40]\n"
2917 "add c_ptr2, c_ptr2, #0x10\n"
2918 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
2919 "str q27, [c_ptr3]\n"
2921 "ldr q21, [%[b_ptr0], #0x50]\n"
2922 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
2923 "ldr q22, [%[b_ptr0], #0x60]\n"
2924 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
2925 "str q28, [c_ptr4]\n"
2927 "ldr q23, [%[b_ptr0], #0x70]\n"
2928 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
2929 "add c_ptr3, c_ptr3, #0x10\n"
2930 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
2931 "str q29, [c_ptr5]\n"
2933 "add c_ptr4, c_ptr4, #0x10\n"
2934 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
2935 "str q30, [c_ptr6]\n"
2937 "add c_ptr5, c_ptr5, #0x10\n"
2938 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
2939 "str q31, [c_ptr7]\n"
2941 "add c_ptr6, c_ptr6, #0x10\n"
2942 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
2943 "add c_ptr7, c_ptr7, #0x10\n"
2944 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
2945 "add %[b_ptr0], %[b_ptr0], #0x80\n"
2946 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
2947 "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
2948 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
2949 "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
2950 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
2951 "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
2952 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
2953 "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
2954 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
2955 "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
2956 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
2957 "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
2958 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
2959 "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
2960 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
2961 "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
2962 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
2963 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
2964 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
2965 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
2966 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
2967 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
2968 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
2969 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
2970 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
2971 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
2972 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
2973 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
2974 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
2975 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
2976 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
2977 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
2978 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
2979 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
2980 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
2981 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
2982 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
2983 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
2984 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
2985 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
2986 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
2987 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
2988 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
2989 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
2990 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
2991 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
2992 ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
2993 ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
2994 ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
2995 ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
2996 ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
2997 ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
2998 ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
2999 ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
3000 ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
3001 ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
3002 ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
3003 ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
3004 ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
3005 ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
3006 ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
3007 ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
3010 "str q24, [%[c_ptr0]]\n"
3011 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3013 "ldr q16, [%[b_ptr0]]\n"
3014 "ldr q17, [%[b_ptr0], #0x10]\n"
3015 "str q25, [c_ptr1]\n"
3016 "add c_ptr1, c_ptr1, #0x10\n"
3018 "ldr q18, [%[b_ptr0], #0x20]\n"
3019 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
3020 "str q26, [c_ptr2]\n"
3022 "ldr q19, [%[b_ptr0], #0x30]\n"
3023 "ldr q20, [%[b_ptr0], #0x40]\n"
3024 "add c_ptr2, c_ptr2, #0x10\n"
3025 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
3026 "str q27, [c_ptr3]\n"
3028 "ldr q21, [%[b_ptr0], #0x50]\n"
3029 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
3030 "ldr q22, [%[b_ptr0], #0x60]\n"
3031 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
3032 "str q28, [c_ptr4]\n"
3034 "ldr q23, [%[b_ptr0], #0x70]\n"
3035 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
3036 "add c_ptr3, c_ptr3, #0x10\n"
3037 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
3038 "str q29, [c_ptr5]\n"
3040 "add c_ptr4, c_ptr4, #0x10\n"
3041 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
3042 "str q30, [c_ptr6]\n"
3044 "add c_ptr5, c_ptr5, #0x10\n"
3045 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
3046 "str q31, [c_ptr7]\n"
3048 "add c_ptr6, c_ptr6, #0x10\n"
3049 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
3050 "add c_ptr7, c_ptr7, #0x10\n"
3051 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
3052 "add %[b_ptr0], %[b_ptr0], #0x80\n"
3053 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
3054 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
3055 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
3056 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
3057 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
3058 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
3059 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
3060 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
3061 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
3062 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
3063 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
3064 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
3065 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
3066 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
3067 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
3068 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
3069 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
3070 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
3071 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
3072 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
3073 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
3074 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
3075 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
3076 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
3077 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
3078 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
3079 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
3080 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
3081 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
3082 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
3083 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
3084 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
3085 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
3086 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
3087 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
3088 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
3089 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
3090 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
3091 ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
3092 ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
3093 ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
3094 ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
3095 ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
3096 ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
3097 ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
3098 ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
3099 ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
3100 ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
3101 ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
3102 ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
3103 ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
3104 ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
3105 ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
3106 ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
3117 ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
3118 ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
3119 ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
3120 ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
3121 ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
3122 ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
3123 ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
3124 ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
3125 ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
3126 ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
3127 ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
3128 ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
3129 ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
3130 ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
3131 ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
3132 ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
3133 ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
3134 ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
3135 ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
3136 ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
3137 ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
3138 ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
3139 ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
3140 ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
3141 ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
3142 ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
3143 ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
3144 ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
3145 ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
3146 ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
3147 ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
3148 ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
3149 ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
3150 ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
3151 ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
3152 ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
3153 ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
3154 ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
3155 ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
3156 ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
3157 ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
3158 ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
3159 ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
3160 ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
3161 ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
3162 ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
3163 ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
3164 ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
3165 ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
3166 ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
3167 ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
3168 ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
3169 ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
3170 ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
3171 ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
3172 ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
3173 ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
3174 ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
3175 ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
3176 ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
3177 ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
3178 ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
3179 ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
3180 ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
3182 "str q24, [%[c_ptr0]]\n"
3183 "add %[c_ptr0], %[c_ptr0], #0x10\n"
3184 "str q25, [c_ptr1]\n"
3185 "str q26, [c_ptr2]\n"
3186 "str q27, [c_ptr3]\n"
3187 "str q28, [c_ptr4]\n"
3188 "str q29, [c_ptr5]\n"
3189 "str q30, [c_ptr6]\n"
3190 "str q31, [c_ptr7]\n"
3205 : [a_ptr0]
"+r" (a_ptr0), [b_ptr0]
"+r" (b_ptr0), [c_ptr0]
"+r" (c_ptr0), [loops]
"+r" (loops), [oob_rows]
"+r" (oob_rows), [odds]
"+r" (odds)
3206 : [lda]
"r" (ldab), [ldc]
"r" (ldcb)
3207 :
"x0",
"x1",
"x2",
"x3",
"x4",
"x5",
"x6",
"x7",
"x8",
"x9",
"x10",
"x11",
"x12",
"x13",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25",
"v26",
"v27",
"v28",
"v29",
"v30",
"v31",
"cc",
"memory"
3216 #endif // __aarch64__